Merge branch 'master' into retopo_transform

Cleanup: Replace reinterpret_cast<> with static_cast<> in UI code
Fix attempt for MSVC build error after 42ccbb7cd1
2022-07-29 13:51:49 -04:00 · 2022-07-29 18:45:12 +02:00 · 2022-07-29 18:10:26 +02:00 · 2022-07-29 18:00:50 +02:00 · 2022-07-29 17:54:32 +02:00 · 2022-07-29 16:56:48 +02:00
614 changed files with 15438 additions and 10411 deletions
--- a/build_files/build_environment/CMakeLists.txt
+++ b/build_files/build_environment/CMakeLists.txt
@@ -139,6 +139,7 @@ if(NOT WIN32 OR ENABLE_MINGW64)
    include(cmake/vpx.cmake)
    include(cmake/x264.cmake)
    include(cmake/xvidcore.cmake)
+    include(cmake/aom.cmake)
    include(cmake/ffmpeg.cmake)
    include(cmake/fftw.cmake)
    include(cmake/sndfile.cmake)
--- a/build_files/build_environment/cmake/alembic.cmake
+++ b/build_files/build_environment/cmake/alembic.cmake
@@ -42,4 +42,5 @@ endif()
 add_dependencies(
  external_alembic
  external_openexr
+  external_imath
 )
--- a/build_files/build_environment/cmake/aom.cmake
+++ b/build_files/build_environment/cmake/aom.cmake
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+if(WIN32)
+  # The default generator on Windows is msbuild, which we do not
+  # want to use for this dep, as it needs to build with mingw.
+  set(AOM_GENERATOR "Ninja")
+  # The default flags are full of MSVC options. Given this will be
+  # building with mingw, it'll have an unhappy time with those, so
+  # we need to clear them out.
+  set(AOM_CMAKE_FLAGS )
+  # CMake will correctly identify pthreads being available, however
+  # we do not want to use them, as that gains a dependency on
+  # libpthreadswin.dll which we do not want. When pthreads is not
+  # available aom will use a pthreads emulation layer using win32 threads.
+  set(AOM_EXTRA_ARGS_WIN32 -DCMAKE_HAVE_PTHREAD_H=OFF)
+else()
+  set(AOM_GENERATOR "Unix Makefiles")
+  set(AOM_CMAKE_FLAGS ${DEFAULT_CMAKE_FLAGS})
+endif()
+
+set(AOM_EXTRA_ARGS
+  -DENABLE_TESTDATA=OFF
+  -DENABLE_TESTS=OFF
+  -DENABLE_TOOLS=OFF
+  -DENABLE_EXAMPLES=OFF
+  ${AOM_EXTRA_ARGS_WIN32}
+)
+
+# This is slightly different from all other deps in the way that
+# aom uses cmake as a build system, but still needs the environment set up
+# to include perl, so we manually set up the environment and call
+# cmake directly for the configure, build and install commands.
+
+ExternalProject_Add(external_aom
+  URL file://${PACKAGE_DIR}/${AOM_FILE}
+  DOWNLOAD_DIR ${DOWNLOAD_DIR}
+  URL_HASH ${AOM_HASH_TYPE}=${AOM_HASH}
+  PREFIX ${BUILD_DIR}/aom
+  CONFIGURE_COMMAND ${CONFIGURE_ENV} &&
+    cd ${BUILD_DIR}/aom/src/external_aom-build/ &&
+    ${CMAKE_COMMAND} -G "${AOM_GENERATOR}" -DCMAKE_INSTALL_PREFIX=${LIBDIR}/aom ${AOM_CMAKE_FLAGS} ${AOM_EXTRA_ARGS} ${BUILD_DIR}/aom/src/external_aom/
+  BUILD_COMMAND ${CMAKE_COMMAND} --build .
+  INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install
+  INSTALL_DIR ${LIBDIR}/aom
+)
--- a/build_files/build_environment/cmake/download.cmake
+++ b/build_files/build_environment/cmake/download.cmake
@@ -116,3 +116,4 @@ download_source(IGC_SPIRV_TOOLS)
 download_source(IGC_SPIRV_TRANSLATOR)
 download_source(GMMLIB)
 download_source(OCLOC)
+download_source(AOM)
--- a/build_files/build_environment/cmake/ffmpeg.cmake
+++ b/build_files/build_environment/cmake/ffmpeg.cmake
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

-set(FFMPEG_CFLAGS "-I${mingw_LIBDIR}/lame/include -I${mingw_LIBDIR}/openjpeg/include/ -I${mingw_LIBDIR}/ogg/include -I${mingw_LIBDIR}/vorbis/include -I${mingw_LIBDIR}/theora/include -I${mingw_LIBDIR}/opus/include -I${mingw_LIBDIR}/vpx/include -I${mingw_LIBDIR}/x264/include -I${mingw_LIBDIR}/xvidcore/include -I${mingw_LIBDIR}/zlib/include")
-set(FFMPEG_LDFLAGS "-L${mingw_LIBDIR}/lame/lib -L${mingw_LIBDIR}/openjpeg/lib -L${mingw_LIBDIR}/ogg/lib -L${mingw_LIBDIR}/vorbis/lib -L${mingw_LIBDIR}/theora/lib -L${mingw_LIBDIR}/opus/lib -L${mingw_LIBDIR}/vpx/lib -L${mingw_LIBDIR}/x264/lib -L${mingw_LIBDIR}/xvidcore/lib -L${mingw_LIBDIR}/zlib/lib")
+set(FFMPEG_CFLAGS "-I${mingw_LIBDIR}/lame/include -I${mingw_LIBDIR}/openjpeg/include/ -I${mingw_LIBDIR}/ogg/include -I${mingw_LIBDIR}/vorbis/include -I${mingw_LIBDIR}/theora/include -I${mingw_LIBDIR}/opus/include -I${mingw_LIBDIR}/vpx/include -I${mingw_LIBDIR}/x264/include -I${mingw_LIBDIR}/xvidcore/include -I${mingw_LIBDIR}/zlib/include -I${mingw_LIBDIR}/aom/include")
+set(FFMPEG_LDFLAGS "-L${mingw_LIBDIR}/lame/lib -L${mingw_LIBDIR}/openjpeg/lib -L${mingw_LIBDIR}/ogg/lib -L${mingw_LIBDIR}/vorbis/lib -L${mingw_LIBDIR}/theora/lib -L${mingw_LIBDIR}/opus/lib -L${mingw_LIBDIR}/vpx/lib -L${mingw_LIBDIR}/x264/lib -L${mingw_LIBDIR}/xvidcore/lib -L${mingw_LIBDIR}/zlib/lib -L${mingw_LIBDIR}/aom/lib")
 set(FFMPEG_EXTRA_FLAGS --pkg-config-flags=--static --extra-cflags=${FFMPEG_CFLAGS} --extra-ldflags=${FFMPEG_LDFLAGS})
-set(FFMPEG_ENV PKG_CONFIG_PATH=${mingw_LIBDIR}/openjpeg/lib/pkgconfig:${mingw_LIBDIR}/x264/lib/pkgconfig:${mingw_LIBDIR}/vorbis/lib/pkgconfig:${mingw_LIBDIR}/ogg/lib/pkgconfig:${mingw_LIBDIR}:${mingw_LIBDIR}/vpx/lib/pkgconfig:${mingw_LIBDIR}/theora/lib/pkgconfig:${mingw_LIBDIR}/openjpeg/lib/pkgconfig:${mingw_LIBDIR}/opus/lib/pkgconfig:)
+set(FFMPEG_ENV PKG_CONFIG_PATH=${mingw_LIBDIR}/openjpeg/lib/pkgconfig:${mingw_LIBDIR}/x264/lib/pkgconfig:${mingw_LIBDIR}/vorbis/lib/pkgconfig:${mingw_LIBDIR}/ogg/lib/pkgconfig:${mingw_LIBDIR}:${mingw_LIBDIR}/vpx/lib/pkgconfig:${mingw_LIBDIR}/theora/lib/pkgconfig:${mingw_LIBDIR}/openjpeg/lib/pkgconfig:${mingw_LIBDIR}/opus/lib/pkgconfig:${mingw_LIBDIR}/aom/lib/pkgconfig:)

 if(WIN32)
  set(FFMPEG_ENV set ${FFMPEG_ENV} &&)
@@ -79,6 +79,7 @@ ExternalProject_Add(external_ffmpeg
    --disable-librtmp
    --enable-libx264
    --enable-libxvid
+    --enable-libaom
    --disable-libopencore-amrnb
    --disable-libopencore-amrwb
    --disable-libdc1394
@@ -125,6 +126,7 @@ add_dependencies(
  external_vorbis
  external_ogg
  external_lame
+  external_aom
 )
 if(WIN32)
  add_dependencies(
--- a/build_files/build_environment/cmake/flex.cmake
+++ b/build_files/build_environment/cmake/flex.cmake
@@ -5,8 +5,6 @@ ExternalProject_Add(external_flex
  URL_HASH ${FLEX_HASH_TYPE}=${FLEX_HASH}
  DOWNLOAD_DIR ${DOWNLOAD_DIR}
  PREFIX ${BUILD_DIR}/flex
-  # This patch fixes build with some versions of glibc (https://github.com/westes/flex/commit/24fd0551333e7eded87b64dd36062da3df2f6380)
-  PATCH_COMMAND ${PATCH_CMD} -d ${BUILD_DIR}/flex/src/external_flex < ${PATCH_DIR}/flex.diff
  CONFIGURE_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/flex/src/external_flex/ && ${CONFIGURE_COMMAND} --prefix=${LIBDIR}/flex
  BUILD_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/flex/src/external_flex/ && make -j${MAKE_THREADS}
  INSTALL_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/flex/src/external_flex/ && make install
--- a/build_files/build_environment/cmake/harvest.cmake
+++ b/build_files/build_environment/cmake/harvest.cmake
@@ -25,9 +25,6 @@ if(BUILD_MODE STREQUAL Release)
        # glew-> opengl
        ${CMAKE_COMMAND} -E copy ${LIBDIR}/glew/lib/libglew32.lib ${HARVEST_TARGET}/opengl/lib/glew.lib &&
        ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/glew/include/ ${HARVEST_TARGET}/opengl/include/ &&
-        # tiff
-        ${CMAKE_COMMAND} -E copy ${LIBDIR}/tiff/lib/tiff.lib ${HARVEST_TARGET}/tiff/lib/libtiff.lib &&
-        ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/tiff/include/ ${HARVEST_TARGET}/tiff/include/
    DEPENDS
  )
 endif()
@@ -177,6 +174,7 @@ harvest(opus/lib ffmpeg/lib "*.a")
 harvest(vpx/lib ffmpeg/lib "*.a")
 harvest(x264/lib ffmpeg/lib "*.a")
 harvest(xvidcore/lib ffmpeg/lib "*.a")
+harvest(aom/lib ffmpeg/lib "*.a")
 harvest(webp/lib webp/lib "*.a")
 harvest(webp/include webp/include "*.h")
 harvest(usd/include usd/include "*.h")
--- a/build_files/build_environment/cmake/openimageio.cmake
+++ b/build_files/build_environment/cmake/openimageio.cmake
@@ -18,9 +18,15 @@ if(WIN32)
  set(PNG_LIBNAME libpng16_static${LIBEXT})
  set(OIIO_SIMD_FLAGS -DUSE_SIMD=sse2)
  set(OPENJPEG_POSTFIX _msvc)
+  if(BUILD_MODE STREQUAL Debug)
+    set(TIFF_POSTFIX d)
+  else()
+    set(TIFF_POSTFIX)
+  endif()
 else()
  set(PNG_LIBNAME libpng${LIBEXT})
  set(OIIO_SIMD_FLAGS)
+  set(TIFF_POSTFIX)
 endif()

 if(MSVC)
@@ -65,7 +71,7 @@ set(OPENIMAGEIO_EXTRA_ARGS
  -DZLIB_INCLUDE_DIR=${LIBDIR}/zlib/include
  -DPNG_LIBRARY=${LIBDIR}/png/lib/${PNG_LIBNAME}
  -DPNG_PNG_INCLUDE_DIR=${LIBDIR}/png/include
-  -DTIFF_LIBRARY=${LIBDIR}/tiff/lib/${LIBPREFIX}tiff${LIBEXT}
+  -DTIFF_LIBRARY=${LIBDIR}/tiff/lib/${LIBPREFIX}tiff${TIFF_POSTFIX}${LIBEXT}
  -DTIFF_INCLUDE_DIR=${LIBDIR}/tiff/include
  -DJPEG_LIBRARY=${LIBDIR}/jpeg/lib/${JPEG_LIBRARY}
  -DJPEG_INCLUDE_DIR=${LIBDIR}/jpeg/include
--- a/build_files/build_environment/cmake/tiff.cmake
+++ b/build_files/build_environment/cmake/tiff.cmake
@@ -3,6 +3,8 @@
 set(TIFF_EXTRA_ARGS
  -DZLIB_LIBRARY=${LIBDIR}/zlib/lib/${ZLIB_LIBRARY}
  -DZLIB_INCLUDE_DIR=${LIBDIR}/zlib/include
+  -DJPEG_LIBRARY=${LIBDIR}/jpeg/lib/${JPEG_LIBRARY}
+  -DJPEG_INCLUDE_DIR=${LIBDIR}/jpeg/include
  -DPNG_STATIC=ON
  -DBUILD_SHARED_LIBS=OFF
  -Dlzma=OFF
@@ -24,10 +26,12 @@ add_dependencies(
  external_tiff
  external_zlib
 )
-
-if(WIN32 AND BUILD_MODE STREQUAL Debug)
-  ExternalProject_Add_Step(external_tiff after_install
-    COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/tiff/lib/tiffd${LIBEXT} ${LIBDIR}/tiff/lib/tiff${LIBEXT}
-    DEPENDEES install
-  )
+if(WIN32)
+  if(BUILD_MODE STREQUAL Release)
+    ExternalProject_Add_Step(external_tiff after_install
+      COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/tiff/lib/tiff.lib ${HARVEST_TARGET}/tiff/lib/libtiff.lib &&
+              ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/tiff/include/ ${HARVEST_TARGET}/tiff/include/
+      DEPENDEES install
+    )
+  endif()
 endif()
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -45,15 +45,15 @@ set(PTHREADS_HASH f3bf81bb395840b3446197bcf4ecd653)
 set(PTHREADS_HASH_TYPE MD5)
 set(PTHREADS_FILE pthreads4w-code-${PTHREADS_VERSION}.zip)

-set(OPENEXR_VERSION 3.1.4)
+set(OPENEXR_VERSION 3.1.5)
 set(OPENEXR_URI https://github.com/AcademySoftwareFoundation/openexr/archive/v${OPENEXR_VERSION}.tar.gz)
-set(OPENEXR_HASH e990be1ff765797bc2d93a8060e1c1f2)
+set(OPENEXR_HASH a92f38eedd43e56c0af56d4852506886)
 set(OPENEXR_HASH_TYPE MD5)
 set(OPENEXR_FILE openexr-${OPENEXR_VERSION}.tar.gz)

-set(IMATH_VERSION 3.1.4)
+set(IMATH_VERSION 3.1.5)
 set(IMATH_URI https://github.com/AcademySoftwareFoundation/Imath/archive/v${OPENEXR_VERSION}.tar.gz)
-set(IMATH_HASH fddf14ec73e12c34e74c3c175e311a3f)
+set(IMATH_HASH dd375574276c54872b7b3d54053baff0)
 set(IMATH_HASH_TYPE MD5)
 set(IMATH_FILE imath-${IMATH_VERSION}.tar.gz)

@@ -163,9 +163,9 @@ set(ROBINMAP_HASH c08ec4b1bf1c85eb0d6432244a6a89862229da1cb834f3f90fba8dc35d8c8e
 set(ROBINMAP_HASH_TYPE SHA256)
 set(ROBINMAP_FILE robinmap-${ROBINMAP_VERSION}.tar.gz)

-set(TIFF_VERSION 4.3.0)
+set(TIFF_VERSION 4.4.0)
 set(TIFF_URI http://download.osgeo.org/libtiff/tiff-${TIFF_VERSION}.tar.gz)
-set(TIFF_HASH 0a2e4744d1426a8fc8211c0cdbc3a1b3)
+set(TIFF_HASH 376f17f189e9d02280dfe709b2b2bbea)
 set(TIFF_HASH_TYPE MD5)
 set(TIFF_FILE tiff-${TIFF_VERSION}.tar.gz)

@@ -633,3 +633,9 @@ set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC
 set(OCLOC_HASH ab22b8bf2560a57fdd3def0e35a62ca75991406f959c0263abb00cd6cd9ae998)
 set(OCLOC_HASH_TYPE SHA256)
 set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)
+
+set(AOM_VERSION 3.4.0)
+set(AOM_URI https://storage.googleapis.com/aom-releases/libaom-${AOM_VERSION}.tar.gz)
+set(AOM_HASH bd754b58c3fa69f3ffd29da77de591bd9c26970e3b18537951336d6c0252e354)
+set(AOM_HASH_TYPE SHA256)
+set(AOM_FILE libaom-${AOM_VERSION}.tar.gz)
--- a/build_files/build_environment/cmake/vpx.cmake
+++ b/build_files/build_environment/cmake/vpx.cmake
@@ -1,11 +1,13 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 if(WIN32)
-  if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-    set(VPX_EXTRA_FLAGS --target=x86_64-win64-gcc --disable-multithread)
-  else()
-    set(VPX_EXTRA_FLAGS --target=x86-win32-gcc --disable-multithread)
-  endif()
+  # VPX is determined to use pthreads which it will tell ffmpeg to dynamically
+  # link, which is not something we're super into distribution wise. However
+  # if it cannot find pthread.h it'll happily provide a pthread emulation
+  # layer using win32 threads. So all this patch does is make it not find
+  # pthread.h
+  set(VPX_PATCH ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/vpx/src/external_vpx < ${PATCH_DIR}/vpx_windows.diff)
+  set(VPX_EXTRA_FLAGS --target=x86_64-win64-gcc )
 else()
  if(APPLE)
    if("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64")
@@ -18,6 +20,16 @@ else()
  endif()
 endif()

+if(NOT BLENDER_PLATFORM_ARM)
+  list(APPEND VPX_EXTRA_FLAGS
+    --enable-sse4_1
+    --enable-sse3
+    --enable-ssse3
+    --enable-avx
+    --enable-avx2
+  )
+endif()
+
 ExternalProject_Add(external_vpx
  URL file://${PACKAGE_DIR}/${VPX_FILE}
  DOWNLOAD_DIR ${DOWNLOAD_DIR}
@@ -30,11 +42,6 @@ ExternalProject_Add(external_vpx
      --enable-static
      --disable-install-bins
      --disable-install-srcs
-      --disable-sse4_1
-      --disable-sse3
-      --disable-ssse3
-      --disable-avx
-      --disable-avx2
      --disable-unit-tests
      --disable-examples
      --enable-vp8
@@ -42,6 +49,7 @@ ExternalProject_Add(external_vpx
      ${VPX_EXTRA_FLAGS}
  BUILD_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/vpx/src/external_vpx/ && make -j${MAKE_THREADS}
  INSTALL_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/vpx/src/external_vpx/ && make install
+  PATCH_COMMAND ${VPX_PATCH}
  INSTALL_DIR ${LIBDIR}/vpx
 )

--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -478,7 +478,7 @@ OCIO_FORCE_BUILD=false
 OCIO_FORCE_REBUILD=false
 OCIO_SKIP=false

-IMATH_VERSION="3.1.4"
+IMATH_VERSION="3.1.5"
 IMATH_VERSION_SHORT="3.1"
 IMATH_VERSION_MIN="3.0"
 IMATH_VERSION_MEX="4.0"
@@ -487,7 +487,7 @@ IMATH_FORCE_REBUILD=false
 IMATH_SKIP=false
 _with_built_imath=false

-OPENEXR_VERSION="3.1.4"
+OPENEXR_VERSION="3.1.5"
 OPENEXR_VERSION_SHORT="3.1"
 OPENEXR_VERSION_MIN="3.0"
 OPENEXR_VERSION_MEX="4.0"
@@ -627,6 +627,9 @@ WEBP_DEV=""
 VPX_USE=false
 VPX_VERSION_MIN=0.9.7
 VPX_DEV=""
+AOM_USE=false
+AOM_VERSION_MIN=3.3.0
+AOM_DEV=""
 OPUS_USE=false
 OPUS_VERSION_MIN=1.1.1
 OPUS_DEV=""
@@ -1209,7 +1212,7 @@ You may also want to build them yourself (optional ones are [between brackets]):
    ** [NumPy $PYTHON_NUMPY_VERSION] (use pip).
    * Boost $BOOST_VERSION (from $BOOST_SOURCE, modules: $BOOST_BUILD_MODULES).
    * TBB $TBB_VERSION (from $TBB_SOURCE).
-    * [FFMpeg $FFMPEG_VERSION (needs libvorbis, libogg, libtheora, libx264, libmp3lame, libxvidcore, libvpx, libwebp, ...)] (from $FFMPEG_SOURCE).
+    * [FFMpeg $FFMPEG_VERSION (needs libvorbis, libogg, libtheora, libx264, libmp3lame, libxvidcore, libvpx, libaom, libwebp, ...)] (from $FFMPEG_SOURCE).
    * [OpenColorIO $OCIO_VERSION] (from $OCIO_SOURCE).
    * Imath $IMATH_VERSION (from $IMATH_SOURCE).
    * OpenEXR $OPENEXR_VERSION (from $OPENEXR_SOURCE).
@@ -3000,7 +3003,7 @@ compile_ALEMBIC() {
  fi

  # To be changed each time we make edits that would modify the compiled result!
-  alembic_magic=2
+  alembic_magic=3
  _init_alembic

  # Force having own builds for the dependencies.
@@ -3048,7 +3051,7 @@ compile_ALEMBIC() {
    fi
    if [ "$_with_built_openexr" = true ]; then
      cmake_d="$cmake_d -D USE_ARNOLD=OFF"
-      cmake_d="$cmake_d -D USE_BINARIES=OFF"
+      cmake_d="$cmake_d -D USE_BINARIES=ON"  # Tests use some Alembic binaries...
      cmake_d="$cmake_d -D USE_EXAMPLES=OFF"
      cmake_d="$cmake_d -D USE_HDF5=OFF"
      cmake_d="$cmake_d -D USE_MAYA=OFF"
@@ -3634,7 +3637,7 @@ compile_FFmpeg() {
  fi

  # To be changed each time we make edits that would modify the compiled result!
-  ffmpeg_magic=5
+  ffmpeg_magic=6
  _init_ffmpeg

  # Force having own builds for the dependencies.
@@ -3687,6 +3690,10 @@ compile_FFmpeg() {
      extra="$extra --enable-libvpx"
    fi

+    if [ "$AOM_USE" = true ]; then
+      extra="$extra --enable-libaom"
+    fi
+
    if [ "$WEBP_USE" = true ]; then
      extra="$extra --enable-libwebp"
    fi
@@ -4140,30 +4147,34 @@ install_DEB() {
    WEBP_USE=true
  fi

-  if [ "$WITH_ALL" = true ]; then
-    XVID_DEV="libxvidcore-dev"
-    check_package_DEB $XVID_DEV
-    if [ $? -eq 0 ]; then
-      XVID_USE=true
-    fi
+  XVID_DEV="libxvidcore-dev"
+  check_package_DEB $XVID_DEV
+  if [ $? -eq 0 ]; then
+    XVID_USE=true
+  fi

-    MP3LAME_DEV="libmp3lame-dev"
-    check_package_DEB $MP3LAME_DEV
-    if [ $? -eq 0 ]; then
-      MP3LAME_USE=true
-    fi
+  MP3LAME_DEV="libmp3lame-dev"
+  check_package_DEB $MP3LAME_DEV
+  if [ $? -eq 0 ]; then
+    MP3LAME_USE=true
+  fi

-    VPX_DEV="libvpx-dev"
-    check_package_version_ge_DEB $VPX_DEV $VPX_VERSION_MIN
-    if [ $? -eq 0 ]; then
-      VPX_USE=true
-    fi
+  VPX_DEV="libvpx-dev"
+  check_package_version_ge_DEB $VPX_DEV $VPX_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    VPX_USE=true
+  fi

-    OPUS_DEV="libopus-dev"
-    check_package_version_ge_DEB $OPUS_DEV $OPUS_VERSION_MIN
-    if [ $? -eq 0 ]; then
-      OPUS_USE=true
-    fi
+  AOM_DEV="libaom-dev"
+  check_package_version_ge_DEB $AOM_DEV $AOM_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    AOM_USE=true
+  fi
+
+  OPUS_DEV="libopus-dev"
+  check_package_version_ge_DEB $OPUS_DEV $OPUS_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    OPUS_USE=true
  fi

  # Check cmake version and disable features for older distros.
@@ -4546,6 +4557,9 @@ install_DEB() {
    if [ "$VPX_USE" = true ]; then
      _packages="$_packages $VPX_DEV"
    fi
+    if [ "$AOM_USE" = true ]; then
+      _packages="$_packages $AOM_DEV"
+    fi
    if [ "$OPUS_USE" = true ]; then
      _packages="$_packages $OPUS_DEV"
    fi
@@ -4846,21 +4860,27 @@ install_RPM() {
    WEBP_USE=true
  fi

-  if [ "$WITH_ALL" = true ]; then
-    VPX_DEV="libvpx-devel"
-    check_package_version_ge_RPM $VPX_DEV $VPX_VERSION_MIN
-    if [ $? -eq 0 ]; then
-      VPX_USE=true
-    fi
+  VPX_DEV="libvpx-devel"
+  check_package_version_ge_RPM $VPX_DEV $VPX_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    VPX_USE=true
+  fi

+  AOM_DEV="libaom-devel"
+  check_package_version_ge_RPM $AOM_DEV $AOM_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    AOM_USE=true
+  fi
+
+  OPUS_DEV="libopus-devel"
+  check_package_version_ge_RPM $OPUS_DEV $OPUS_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    OPUS_USE=true
+  fi
+
+  if [ "$WITH_ALL" = true ]; then
    PRINT ""
    install_packages_RPM libspnav-devel
-
-    OPUS_DEV="libopus-devel"
-    check_package_version_ge_RPM $OPUS_DEV $OPUS_VERSION_MIN
-    if [ $? -eq 0 ]; then
-      OPUS_USE=true
-    fi
  fi

  PRINT ""
@@ -5245,6 +5265,9 @@ install_RPM() {
    if [ "$VPX_USE" = true ]; then
      _packages="$_packages $VPX_DEV"
    fi
+    if [ "$AOM_USE" = true ]; then
+      _packages="$_packages $AOM_DEV"
+    fi
    if [ "$OPUS_USE" = true ]; then
      _packages="$_packages $OPUS_DEV"
    fi
@@ -5434,30 +5457,34 @@ install_ARCH() {
    WEBP_USE=true
  fi

-  if [ "$WITH_ALL" = true ]; then
-    XVID_DEV="xvidcore"
-    check_package_ARCH $XVID_DEV
-    if [ $? -eq 0 ]; then
-      XVID_USE=true
-    fi
+  XVID_DEV="xvidcore"
+  check_package_ARCH $XVID_DEV
+  if [ $? -eq 0 ]; then
+    XVID_USE=true
+  fi

-    MP3LAME_DEV="lame"
-    check_package_ARCH $MP3LAME_DEV
-    if [ $? -eq 0 ]; then
-      MP3LAME_USE=true
-    fi
+  MP3LAME_DEV="lame"
+  check_package_ARCH $MP3LAME_DEV
+  if [ $? -eq 0 ]; then
+    MP3LAME_USE=true
+  fi

-    VPX_DEV="libvpx"
-    check_package_version_ge_ARCH $VPX_DEV $VPX_VERSION_MIN
-    if [ $? -eq 0 ]; then
-      VPX_USE=true
-    fi
+  VPX_DEV="libvpx"
+  check_package_version_ge_ARCH $VPX_DEV $VPX_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    VPX_USE=true
+  fi

-    OPUS_DEV="opus"
-    check_package_version_ge_ARCH $OPUS_DEV $OPUS_VERSION_MIN
-    if [ $? -eq 0 ]; then
-      OPUS_USE=true
-    fi
+  AOM_DEV="libaom"
+  check_package_version_ge_ARCH $AOM_DEV $AOM_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    AOM_USE=true
+  fi
+
+  OPUS_DEV="opus"
+  check_package_version_ge_ARCH $OPUS_DEV $OPUS_VERSION_MIN
+  if [ $? -eq 0 ]; then
+    OPUS_USE=true
  fi


@@ -5835,6 +5862,9 @@ install_ARCH() {
    if [ "$VPX_USE" = true ]; then
      _packages="$_packages $VPX_DEV"
    fi
+    if [ "$AOM_USE" = true ]; then
+      _packages="$_packages $AOM_DEV"
+    fi
    if [ "$OPUS_USE" = true ]; then
      _packages="$_packages $OPUS_DEV"
    fi
--- a/build_files/build_environment/patches/flex.diff
+++ b/build_files/build_environment/patches/flex.diff
@@ -1,15 +0,0 @@
-diff --git a/configure.ac b/configure.ac
-index c6f12d644..3c977a4e3 100644
--- a/configure.ac
-+++ b/configure.ac
-@@ -25,8 +25,10 @@
- # autoconf requirements and initialization
- 
- AC_INIT([the fast lexical analyser generator],[2.6.4],[flex-help@lists.sourceforge.net],[flex])
-+AC_PREREQ([2.60])
- AC_CONFIG_SRCDIR([src/scan.l])
- AC_CONFIG_AUX_DIR([build-aux])
-+AC_USE_SYSTEM_EXTENSIONS
- LT_INIT
- AM_INIT_AUTOMAKE([1.15 -Wno-portability foreign std-options dist-lzip parallel-tests subdir-objects])
- AC_CONFIG_HEADER([src/config.h])
--- a/build_files/build_environment/patches/vpx_windows.diff
+++ b/build_files/build_environment/patches/vpx_windows.diff
@@ -0,0 +1,11 @@
+diff -Naur orig/configure external_vpx/configure
+--- orig/configure	2022-07-06 09:22:04 -0600
+++ external_vpx/configure	2022-07-06 09:24:12 -0600
+@@ -270,7 +270,6 @@
+ HAVE_LIST="
+     ${ARCH_EXT_LIST}
+     vpx_ports
+-    pthread_h
+     unistd_h
+ "
+ EXPERIMENT_LIST="
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -78,11 +78,6 @@ if(UNIX AND NOT APPLE)
  set(WITH_PULSEAUDIO          ON  CACHE BOOL "" FORCE)
  set(WITH_X11_XINPUT          ON  CACHE BOOL "" FORCE)
  set(WITH_X11_XF86VMODE       ON  CACHE BOOL "" FORCE)
-
-  # Disable oneAPI on Linux for the time being.
-  # The AoT compilation takes too long to be used officially in the buildbot CI/CD and the JIT
-  # compilation has ABI compatibility issues when running builds made on centOS on Ubuntu.
-  set(WITH_CYCLES_DEVICE_ONEAPI OFF  CACHE BOOL "" FORCE)
 endif()
 if(NOT APPLE)
  set(WITH_XR_OPENXR              ON  CACHE BOOL "" FORCE)
@@ -93,6 +88,6 @@ if(NOT APPLE)
  set(WITH_CYCLES_HIP_BINARIES    ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_DEVICE_ONEAPI   ON  CACHE BOOL "" FORCE)

-  # Disable AoT kernels compilations until buildbot can deliver them in a reasonabel time.
+  # Disable AoT kernels compilations until buildbot can deliver them in a reasonable time.
  set(WITH_CYCLES_ONEAPI_BINARIES OFF CACHE BOOL "" FORCE)
 endif()
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -162,6 +162,9 @@ if(WITH_CODEC_FFMPEG)
    mp3lame ogg opus swresample swscale
    theora theoradec theoraenc vorbis vorbisenc
    vorbisfile vpx x264 xvidcore)
+  if(EXISTS ${LIBDIR}/ffmpeg/lib/libaom.a)
+    list(APPEND FFMPEG_FIND_COMPONENTS aom)
+  endif()
  find_package(FFmpeg)
 endif()

@@ -467,8 +470,9 @@ string(APPEND CMAKE_CXX_FLAGS " -ftemplate-depth=1024")

 # Avoid conflicts with Luxrender, and other plug-ins that may use the same
 # libraries as Blender with a different version or build options.
+set(PLATFORM_SYMBOLS_MAP ${CMAKE_SOURCE_DIR}/source/creator/symbols_apple.map)
 string(APPEND PLATFORM_LINKFLAGS
-  " -Wl,-unexported_symbols_list,'${CMAKE_SOURCE_DIR}/source/creator/osx_locals.map'"
+  " -Wl,-unexported_symbols_list,'${PLATFORM_SYMBOLS_MAP}'"
 )

 string(APPEND CMAKE_CXX_FLAGS " -stdlib=libc++")
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -202,6 +202,9 @@ if(WITH_CODEC_FFMPEG)
      vpx
      x264
      xvidcore)
+    if(EXISTS ${LIBDIR}/ffmpeg/lib/libaom.a)
+      list(APPEND FFMPEG_FIND_COMPONENTS aom)
+    endif()
  elseif(FFMPEG)
    # Old cache variable used for root dir, convert to new standard.
    set(FFMPEG_ROOT_DIR ${FFMPEG})
@@ -885,8 +888,9 @@ unset(_IS_LINKER_DEFAULT)

 # Avoid conflicts with Mesa llvmpipe, Luxrender, and other plug-ins that may
 # use the same libraries as Blender with a different version or build options.
+set(PLATFORM_SYMBOLS_MAP ${CMAKE_SOURCE_DIR}/source/creator/symbols_unix.map)
 set(PLATFORM_LINKFLAGS
-  "${PLATFORM_LINKFLAGS} -Wl,--version-script='${CMAKE_SOURCE_DIR}/source/creator/blender.map'"
+  "${PLATFORM_LINKFLAGS} -Wl,--version-script='${PLATFORM_SYMBOLS_MAP}'"
 )

 # Don't use position independent executable for portable install since file
--- a/doc/doxygen/Doxyfile
+++ b/doc/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = Blender
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = V3.3
+PROJECT_NUMBER         = V3.4

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1131,6 +1131,7 @@ def pymodule2sphinx(basepath, module_name, module, title, module_all_extra):
 # Changes In Blender will force errors here.
 context_type_map = {
    # context_member: (RNA type, is_collection)
+    "active_action": ("Action", False),
    "active_annotation_layer": ("GPencilLayer", False),
    "active_bone": ("EditBone", False),
    "active_file": ("FileSelectEntry", False),
--- a/extern/gflags/CMakeLists.txt
+++ b/extern/gflags/CMakeLists.txt
@@ -1,6 +1,13 @@
 # SPDX-License-Identifier: GPL-2.0-or-later
 # Copyright 2016 Blender Foundation. All rights reserved.

+# Too noisy for code we don't maintain.
+if(CMAKE_COMPILER_IS_GNUCC)
+  if(NOT "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS "8.0")
+    add_cxx_flag("-Wno-cast-function-type")
+  endif()
+endif()
+
 set(INC
  src
  src/gflags
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -36,8 +36,13 @@ if(WITH_CYCLES_NATIVE_ONLY)
  )

  if(NOT MSVC)
-    string(APPEND CMAKE_CXX_FLAGS " -march=native")
-    set(CYCLES_KERNEL_FLAGS "-march=native")
+    ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_march_native "-march=native")
+    if(_has_march_native)
+      set(CYCLES_KERNEL_FLAGS "-march=native")
+    else()
+      set(CYCLES_KERNEL_FLAGS "")
+    endif()
+    unset(_has_march_native)
  else()
    if(NOT MSVC_NATIVE_ARCH_FLAGS)
        TRY_RUN(
--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -55,7 +55,7 @@ static bool ObtainCacheParticleData(
    return false;

  Transform tfm = get_transform(b_ob->matrix_world());
-  Transform itfm = transform_quick_inverse(tfm);
+  Transform itfm = transform_inverse(tfm);

  for (BL::Modifier &b_mod : b_ob->modifiers) {
    if ((b_mod.type() == b_mod.type_PARTICLE_SYSTEM) &&
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -928,8 +928,22 @@ static ShaderNode *add_node(Scene *scene,
    sky->set_sun_disc(b_sky_node.sun_disc());
    sky->set_sun_size(b_sky_node.sun_size());
    sky->set_sun_intensity(b_sky_node.sun_intensity());
-    sky->set_sun_elevation(b_sky_node.sun_elevation());
-    sky->set_sun_rotation(b_sky_node.sun_rotation());
+    /* Patch sun position to be able to animate daylight cycle while keeping the shading code
+     * simple. */
+    float sun_rotation = b_sky_node.sun_rotation();
+    /* Wrap into [-2PI..2PI] range. */
+    float sun_elevation = fmodf(b_sky_node.sun_elevation(), M_2PI_F);
+    /* Wrap into [-PI..PI] range. */
+    if (fabsf(sun_elevation) >= M_PI_F) {
+      sun_elevation -= copysignf(2.0f, sun_elevation) * M_PI_F;
+    }
+    /* Wrap into [-PI/2..PI/2] range while keeping the same absolute position. */
+    if (sun_elevation >= M_PI_2_F || sun_elevation <= -M_PI_2_F) {
+      sun_elevation = copysignf(M_PI_F, sun_elevation) - sun_elevation;
+      sun_rotation += M_PI_F;
+    }
+    sky->set_sun_elevation(sun_elevation);
+    sky->set_sun_rotation(sun_rotation);
    sky->set_altitude(b_sky_node.altitude());
    sky->set_air_density(b_sky_node.air_density());
    sky->set_dust_density(b_sky_node.dust_density());
--- a/intern/cycles/blender/sync.h
+++ b/intern/cycles/blender/sync.h
@@ -7,6 +7,7 @@
 #include "MEM_guardedalloc.h"
 #include "RNA_access.h"
 #include "RNA_blender_cpp.h"
+#include "RNA_path.h"
 #include "RNA_types.h"

 #include "blender/id_map.h"
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -21,13 +21,9 @@

 #  include "bvh/embree.h"

-/* Kernel includes are necessary so that the filter function for Embree can access the packed BVH.
- */
-#  include "kernel/bvh/embree.h"
-#  include "kernel/bvh/util.h"
+#  include "kernel/device/cpu/bvh.h"
 #  include "kernel/device/cpu/compat.h"
 #  include "kernel/device/cpu/globals.h"
-#  include "kernel/sample/lcg.h"

 #  include "scene/hair.h"
 #  include "scene/mesh.h"
@@ -46,265 +42,6 @@ static_assert(Object::MAX_MOTION_STEPS <= RTC_MAX_TIME_STEP_COUNT,
 static_assert(Object::MAX_MOTION_STEPS == Geometry::MAX_MOTION_STEPS,
              "Object and Geometry max motion steps inconsistent");

-#  define IS_HAIR(x) (x & 1)
-
-/* This gets called by Embree at every valid ray/object intersection.
- * Things like recording subsurface or shadow hits for later evaluation
- * as well as filtering for volume objects happen here.
- * Cycles' own BVH does that directly inside the traversal calls.
- */
-static void rtc_filter_intersection_func(const RTCFilterFunctionNArguments *args)
-{
-  /* Current implementation in Cycles assumes only single-ray intersection queries. */
-  assert(args->N == 1);
-
-  RTCHit *hit = (RTCHit *)args->hit;
-  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
-  const KernelGlobalsCPU *kg = ctx->kg;
-  const Ray *cray = ctx->ray;
-
-  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
-    *args->valid = 0;
-  }
-}
-
-/* This gets called by Embree at every valid ray/object intersection.
- * Things like recording subsurface or shadow hits for later evaluation
- * as well as filtering for volume objects happen here.
- * Cycles' own BVH does that directly inside the traversal calls.
- */
-static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
-{
-  /* Current implementation in Cycles assumes only single-ray intersection queries. */
-  assert(args->N == 1);
-
-  const RTCRay *ray = (RTCRay *)args->ray;
-  RTCHit *hit = (RTCHit *)args->hit;
-  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
-  const KernelGlobalsCPU *kg = ctx->kg;
-  const Ray *cray = ctx->ray;
-
-  switch (ctx->type) {
-    case CCLIntersectContext::RAY_SHADOW_ALL: {
-      Intersection current_isect;
-      kernel_embree_convert_hit(kg, ray, hit, &current_isect);
-      if (intersection_skip_self_shadow(cray->self, current_isect.object, current_isect.prim)) {
-        *args->valid = 0;
-        return;
-      }
-      /* If no transparent shadows or max number of hits exceeded, all light is blocked. */
-      const int flags = intersection_get_shader_flags(kg, current_isect.prim, current_isect.type);
-      if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->num_hits >= ctx->max_hits) {
-        ctx->opaque_hit = true;
-        return;
-      }
-
-      ++ctx->num_hits;
-
-      /* Always use baked shadow transparency for curves. */
-      if (current_isect.type & PRIMITIVE_CURVE) {
-        ctx->throughput *= intersection_curve_shadow_transparency(
-            kg, current_isect.object, current_isect.prim, current_isect.u);
-
-        if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
-          ctx->opaque_hit = true;
-          return;
-        }
-        else {
-          *args->valid = 0;
-          return;
-        }
-      }
-
-      /* Test if we need to record this transparent intersection. */
-      const uint max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
-      if (ctx->num_recorded_hits < max_record_hits || ray->tfar < ctx->max_t) {
-        /* If maximum number of hits was reached, replace the intersection with the
-         * highest distance. We want to find the N closest intersections. */
-        const uint num_recorded_hits = min(ctx->num_recorded_hits, max_record_hits);
-        uint isect_index = num_recorded_hits;
-        if (num_recorded_hits + 1 >= max_record_hits) {
-          float max_t = ctx->isect_s[0].t;
-          uint max_recorded_hit = 0;
-
-          for (uint i = 1; i < num_recorded_hits; ++i) {
-            if (ctx->isect_s[i].t > max_t) {
-              max_recorded_hit = i;
-              max_t = ctx->isect_s[i].t;
-            }
-          }
-
-          if (num_recorded_hits >= max_record_hits) {
-            isect_index = max_recorded_hit;
-          }
-
-          /* Limit the ray distance and stop counting hits beyond this.
-           * TODO: is there some way we can tell Embree to stop intersecting beyond
-           * this distance when max number of hits is reached?. Or maybe it will
-           * become irrelevant if we make max_hits a very high number on the CPU. */
-          ctx->max_t = max(current_isect.t, max_t);
-        }
-
-        ctx->isect_s[isect_index] = current_isect;
-      }
-
-      /* Always increase the number of recorded hits, even beyond the maximum,
-       * so that we can detect this and trace another ray if needed. */
-      ++ctx->num_recorded_hits;
-
-      /* This tells Embree to continue tracing. */
-      *args->valid = 0;
-      break;
-    }
-    case CCLIntersectContext::RAY_LOCAL:
-    case CCLIntersectContext::RAY_SSS: {
-      /* Check if it's hitting the correct object. */
-      Intersection current_isect;
-      if (ctx->type == CCLIntersectContext::RAY_SSS) {
-        kernel_embree_convert_sss_hit(kg, ray, hit, &current_isect, ctx->local_object_id);
-      }
-      else {
-        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
-        if (ctx->local_object_id != current_isect.object) {
-          /* This tells Embree to continue tracing. */
-          *args->valid = 0;
-          break;
-        }
-      }
-      if (intersection_skip_self_local(cray->self, current_isect.prim)) {
-        *args->valid = 0;
-        return;
-      }
-
-      /* No intersection information requested, just return a hit. */
-      if (ctx->max_hits == 0) {
-        break;
-      }
-
-      /* Ignore curves. */
-      if (IS_HAIR(hit->geomID)) {
-        /* This tells Embree to continue tracing. */
-        *args->valid = 0;
-        break;
-      }
-
-      LocalIntersection *local_isect = ctx->local_isect;
-      int hit_idx = 0;
-
-      if (ctx->lcg_state) {
-        /* See triangle_intersect_subsurface() for the native equivalent. */
-        for (int i = min((int)ctx->max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
-          if (local_isect->hits[i].t == ray->tfar) {
-            /* This tells Embree to continue tracing. */
-            *args->valid = 0;
-            return;
-          }
-        }
-
-        local_isect->num_hits++;
-
-        if (local_isect->num_hits <= ctx->max_hits) {
-          hit_idx = local_isect->num_hits - 1;
-        }
-        else {
-          /* reservoir sampling: if we are at the maximum number of
-           * hits, randomly replace element or skip it */
-          hit_idx = lcg_step_uint(ctx->lcg_state) % local_isect->num_hits;
-
-          if (hit_idx >= ctx->max_hits) {
-            /* This tells Embree to continue tracing. */
-            *args->valid = 0;
-            return;
-          }
-        }
-      }
-      else {
-        /* Record closest intersection only. */
-        if (local_isect->num_hits && current_isect.t > local_isect->hits[0].t) {
-          *args->valid = 0;
-          return;
-        }
-
-        local_isect->num_hits = 1;
-      }
-
-      /* record intersection */
-      local_isect->hits[hit_idx] = current_isect;
-      local_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z));
-      /* This tells Embree to continue tracing. */
-      *args->valid = 0;
-      break;
-    }
-    case CCLIntersectContext::RAY_VOLUME_ALL: {
-      /* Append the intersection to the end of the array. */
-      if (ctx->num_hits < ctx->max_hits) {
-        Intersection current_isect;
-        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
-        if (intersection_skip_self(cray->self, current_isect.object, current_isect.prim)) {
-          *args->valid = 0;
-          return;
-        }
-
-        Intersection *isect = &ctx->isect_s[ctx->num_hits];
-        ++ctx->num_hits;
-        *isect = current_isect;
-        /* Only primitives from volume object. */
-        uint tri_object = isect->object;
-        int object_flag = kernel_data_fetch(object_flag, tri_object);
-        if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
-          --ctx->num_hits;
-        }
-        /* This tells Embree to continue tracing. */
-        *args->valid = 0;
-      }
-      break;
-    }
-    case CCLIntersectContext::RAY_REGULAR:
-    default:
-      if (kernel_embree_is_self_intersection(kg, hit, cray)) {
-        *args->valid = 0;
-        return;
-      }
-      break;
-  }
-}
-
-static void rtc_filter_func_backface_cull(const RTCFilterFunctionNArguments *args)
-{
-  const RTCRay *ray = (RTCRay *)args->ray;
-  RTCHit *hit = (RTCHit *)args->hit;
-
-  /* Always ignore back-facing intersections. */
-  if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
-          make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
-    *args->valid = 0;
-    return;
-  }
-
-  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
-  const KernelGlobalsCPU *kg = ctx->kg;
-  const Ray *cray = ctx->ray;
-
-  if (kernel_embree_is_self_intersection(kg, hit, cray)) {
-    *args->valid = 0;
-  }
-}
-
-static void rtc_filter_occluded_func_backface_cull(const RTCFilterFunctionNArguments *args)
-{
-  const RTCRay *ray = (RTCRay *)args->ray;
-  RTCHit *hit = (RTCHit *)args->hit;
-
-  /* Always ignore back-facing intersections. */
-  if (dot(make_float3(ray->dir_x, ray->dir_y, ray->dir_z),
-          make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z)) > 0.0f) {
-    *args->valid = 0;
-    return;
-  }
-
-  rtc_filter_occluded_func(args);
-}
-
 static size_t unaccounted_mem = 0;

 static bool rtc_memory_monitor_func(void *userPtr, const ssize_t bytes, const bool)
@@ -535,8 +272,8 @@ void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
  set_tri_vertex_buffer(geom_id, mesh, false);

  rtcSetGeometryUserData(geom_id, (void *)prim_offset);
-  rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
-  rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_intersection_func);
+  rtcSetGeometryOccludedFilterFunction(geom_id, kernel_embree_filter_occluded_func);
+  rtcSetGeometryIntersectFilterFunction(geom_id, kernel_embree_filter_intersection_func);
  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());

  rtcCommitGeometry(geom_id);
@@ -739,8 +476,8 @@ void BVHEmbree::add_points(const Object *ob, const PointCloud *pointcloud, int i
  set_point_vertex_buffer(geom_id, pointcloud, false);

  rtcSetGeometryUserData(geom_id, (void *)prim_offset);
-  rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_backface_cull);
-  rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_backface_cull);
+  rtcSetGeometryIntersectFilterFunction(geom_id, kernel_embree_filter_func_backface_cull);
+  rtcSetGeometryOccludedFilterFunction(geom_id, kernel_embree_filter_occluded_func_backface_cull);
  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());

  rtcCommitGeometry(geom_id);
@@ -799,12 +536,13 @@ void BVHEmbree::add_curves(const Object *ob, const Hair *hair, int i)

  rtcSetGeometryUserData(geom_id, (void *)prim_offset);
  if (hair->curve_shape == CURVE_RIBBON) {
-    rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_intersection_func);
-    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func);
+    rtcSetGeometryIntersectFilterFunction(geom_id, kernel_embree_filter_intersection_func);
+    rtcSetGeometryOccludedFilterFunction(geom_id, kernel_embree_filter_occluded_func);
  }
  else {
-    rtcSetGeometryIntersectFilterFunction(geom_id, rtc_filter_func_backface_cull);
-    rtcSetGeometryOccludedFilterFunction(geom_id, rtc_filter_occluded_func_backface_cull);
+    rtcSetGeometryIntersectFilterFunction(geom_id, kernel_embree_filter_func_backface_cull);
+    rtcSetGeometryOccludedFilterFunction(geom_id,
+                                         kernel_embree_filter_occluded_func_backface_cull);
  }
  rtcSetGeometryMask(geom_id, ob->visibility_for_tracing());

--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -402,6 +402,18 @@ unique_ptr<DeviceQueue> OneapiDevice::gpu_queue_create()
  return make_unique<OneapiDeviceQueue>(this);
 }

+int OneapiDevice::get_num_multiprocessors()
+{
+  assert(device_queue_);
+  return oneapi_dll_.oneapi_get_num_multiprocessors(device_queue_);
+}
+
+int OneapiDevice::get_max_num_threads_per_multiprocessor()
+{
+  assert(device_queue_);
+  return oneapi_dll_.oneapi_get_max_num_threads_per_multiprocessor(device_queue_);
+}
+
 bool OneapiDevice::should_use_graphics_interop()
 {
  /* NOTE(@nsirgien): oneAPI doesn't yet support direct writing into graphics API objects, so
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -89,6 +89,9 @@ class OneapiDevice : public Device {

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

+  int get_num_multiprocessors();
+  int get_max_num_threads_per_multiprocessor();
+
  /* NOTE(@nsirgien): Create this methods to avoid some compilation problems on Windows with host
   * side compilation (MSVC). */
  void *usm_aligned_alloc_host(size_t memory_size, size_t alignment);
--- a/intern/cycles/device/oneapi/queue.cpp
+++ b/intern/cycles/device/oneapi/queue.cpp
@@ -36,34 +36,9 @@ OneapiDeviceQueue::~OneapiDeviceQueue()

 int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const
 {
-  int num_states;
-
-  /* TODO: implement and use get_num_multiprocessors and get_max_num_threads_per_multiprocessor. */
-  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
-      oneapi_device_->sycl_queue());
-  if (compute_units >= 128) {
-    /* dGPU path, make sense to allocate more states, because it will be dedicated GPU memory. */
-    int base = 1024 * 1024;
-    /* linear dependency (with coefficient less that 1) from amount of compute units. */
-    num_states = (base * (compute_units / 128)) * 3 / 4;
-
-    /* Limit amount of integrator states by one quarter of device memory, because
-     * other allocations will need some space as well
-     * TODO: base this calculation on the how many states what the GPU is actually capable of
-     * running, with some headroom to improve occupancy. If the texture don't fit, offload into
-     * unified memory. */
-    size_t states_memory_size = num_states * state_size;
-    size_t device_memory_amount =
-        (oneapi_dll_.oneapi_get_memcapacity)(oneapi_device_->sycl_queue());
-    if (states_memory_size >= device_memory_amount / 4) {
-      num_states = device_memory_amount / 4 / state_size;
-    }
-  }
-  else {
-    /* iGPU path - no real need to allocate a lot of integrator states because it is shared GPU
-     * memory. */
-    num_states = 1024 * 512;
-  }
+  const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+                              oneapi_device_->get_max_num_threads_per_multiprocessor();
+  int num_states = max(8 * max_num_threads, 65536) * 16;

  VLOG_DEVICE_STATS << "GPU queue concurrent states: " << num_states << ", using up to "
                    << string_human_readable_size(num_states * state_size);
@@ -73,14 +48,10 @@ int OneapiDeviceQueue::num_concurrent_states(const size_t state_size) const

 int OneapiDeviceQueue::num_concurrent_busy_states() const
 {
-  const size_t compute_units = oneapi_dll_.oneapi_get_compute_units_amount(
-      oneapi_device_->sycl_queue());
-  if (compute_units >= 128) {
-    return 1024 * 1024;
-  }
-  else {
-    return 1024 * 512;
-  }
+  const int max_num_threads = oneapi_device_->get_num_multiprocessors() *
+                              oneapi_device_->get_max_num_threads_per_multiprocessor();
+
+  return 4 * max(8 * max_num_threads, 65536);
 }

 void OneapiDeviceQueue::init_execution()
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -26,7 +26,6 @@
 #  include "util/task.h"
 #  include "util/time.h"

-#  undef __KERNEL_CPU__
 #  define __KERNEL_OPTIX__
 #  include "kernel/device/optix/globals.h"

--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -8,7 +8,6 @@

 #  include "util/time.h"

-#  undef __KERNEL_CPU__
 #  define __KERNEL_OPTIX__
 #  include "kernel/device/optix/globals.h"

--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SRC_KERNEL_DEVICE_ONEAPI
 )

 set(SRC_KERNEL_DEVICE_CPU_HEADERS
+  device/cpu/bvh.h
  device/cpu/compat.h
  device/cpu/image.h
  device/cpu/globals.h
@@ -71,11 +72,13 @@ set(SRC_KERNEL_DEVICE_HIP_HEADERS
 )

 set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
+  device/optix/bvh.h
  device/optix/compat.h
  device/optix/globals.h
 )

 set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/bvh.h
  device/metal/compat.h
  device/metal/context_begin.h
  device/metal/context_end.h
@@ -214,8 +217,6 @@ set(SRC_KERNEL_BVH_HEADERS
  bvh/util.h
  bvh/volume.h
  bvh/volume_all.h
-  bvh/embree.h
-  bvh/metal.h
 )

 set(SRC_KERNEL_CAMERA_HEADERS
@@ -316,6 +317,7 @@ set(SRC_UTIL_HEADERS
  ../util/math_float2.h
  ../util/math_float3.h
  ../util/math_float4.h
+  ../util/math_float8.h
  ../util/math_int2.h
  ../util/math_int3.h
  ../util/math_int4.h
@@ -353,8 +355,6 @@ set(SRC_UTIL_HEADERS
  ../util/types_uint4.h
  ../util/types_uint4_impl.h
  ../util/types_ushort4.h
-  ../util/types_vector3.h
-  ../util/types_vector3_impl.h
 )

 set(LIB
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -1,40 +1,47 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-/* BVH
- *
- * Bounding volume hierarchy for ray tracing. We compile different variations
- * of the same BVH traversal function for faster rendering when some types of
- * primitives are not needed, using #includes to work around the lack of
- * C++ templates in OpenCL.
- *
- * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
- * the code has been extended and modified to support more primitives and work
- * with CPU/CUDA/OpenCL. */
-
 #pragma once

-#ifdef __EMBREE__
-#  include "kernel/bvh/embree.h"
-#endif
-
-#ifdef __METALRT__
-#  include "kernel/bvh/metal.h"
-#endif
-
 #include "kernel/bvh/types.h"
 #include "kernel/bvh/util.h"

 #include "kernel/integrator/state_util.h"

+/* Device specific acceleration structures for ray tracing. */
+
+#if defined(__EMBREE__)
+#  include "kernel/device/cpu/bvh.h"
+#  define __BVH2__
+#elif defined(__METALRT__)
+#  include "kernel/device/metal/bvh.h"
+#elif defined(__KERNEL_OPTIX__)
+#  include "kernel/device/optix/bvh.h"
+#else
+#  define __BVH2__
+#endif
+
 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU_RAYTRACING__)
+#ifdef __BVH2__

-/* Regular BVH traversal */
+/* BVH2
+ *
+ * Bounding volume hierarchy for ray tracing, when no native acceleration
+ * structure is available for the device.
+ *
+ * We compile different variations of the same BVH traversal function for
+ * faster rendering when some types of primitives are not needed, using #includes
+ * to work around the lack of C++ templates in OpenCL.
+ *
+ * Originally based on "Understanding the Efficiency of Ray Traversal on GPUs",
+ * the code has been extended and modified to support more primitives and work
+ * with CPU and various GPU kernel languages. */

 #  include "kernel/bvh/nodes.h"

+/* Regular BVH traversal */
+
 #  define BVH_FUNCTION_NAME bvh_intersect
 #  define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
 #  include "kernel/bvh/traversal.h"
@@ -57,9 +64,46 @@ CCL_NAMESPACE_BEGIN
 #    include "kernel/bvh/traversal.h"
 #  endif

-/* Subsurface scattering BVH traversal */
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  if (!intersection_ray_valid(ray)) {
+    return false;
+  }
+
+#  ifdef __EMBREE__
+  if (kernel_data.device_bvh) {
+    return kernel_embree_intersect(kg, ray, visibility, isect);
+  }
+#  endif
+
+#  ifdef __OBJECT_MOTION__
+  if (kernel_data.bvh.have_motion) {
+#    ifdef __HAIR__
+    if (kernel_data.bvh.have_curves) {
+      return bvh_intersect_hair_motion(kg, ray, isect, visibility);
+    }
+#    endif /* __HAIR__ */
+
+    return bvh_intersect_motion(kg, ray, isect, visibility);
+  }
+#  endif /* __OBJECT_MOTION__ */
+
+#  ifdef __HAIR__
+  if (kernel_data.bvh.have_curves) {
+    return bvh_intersect_hair(kg, ray, isect, visibility);
+  }
+#  endif /* __HAIR__ */
+
+  return bvh_intersect(kg, ray, isect, visibility);
+}
+
+/* Single object BVH traversal, for SSS/AO/bevel. */
+
+#  ifdef __BVH_LOCAL__

-#  if defined(__BVH_LOCAL__)
 #    define BVH_FUNCTION_NAME bvh_intersect_local
 #    define BVH_FUNCTION_FEATURES BVH_HAIR
 #    include "kernel/bvh/local.h"
@@ -69,25 +113,40 @@ CCL_NAMESPACE_BEGIN
 #      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
 #      include "kernel/bvh/local.h"
 #    endif
-#  endif /* __BVH_LOCAL__ */

-/* Volume BVH traversal */
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  if (!intersection_ray_valid(ray)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    return false;
+  }

-#  if defined(__VOLUME__)
-#    define BVH_FUNCTION_NAME bvh_intersect_volume
-#    define BVH_FUNCTION_FEATURES BVH_HAIR
-#    include "kernel/bvh/volume.h"
-
-#    if defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_volume_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
-#      include "kernel/bvh/volume.h"
+#    ifdef __EMBREE__
+  if (kernel_data.device_bvh) {
+    return kernel_embree_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
+  }
 #    endif
-#  endif /* __VOLUME__ */

-/* Record all intersections - Shadow BVH traversal */
+#    ifdef __OBJECT_MOTION__
+  if (kernel_data.bvh.have_motion) {
+    return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
+  }
+#    endif /* __OBJECT_MOTION__ */
+  return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
+}
+#  endif
+
+/* Transparent shadow BVH traversal, recording multiple intersections. */
+
+#  ifdef __SHADOW_RECORD_ALL__

-#  if defined(__SHADOW_RECORD_ALL__)
 #    define BVH_FUNCTION_NAME bvh_intersect_shadow_all
 #    define BVH_FUNCTION_FEATURES BVH_POINTCLOUD
 #    include "kernel/bvh/shadow_all.h"
@@ -110,412 +169,6 @@ CCL_NAMESPACE_BEGIN
 #      include "kernel/bvh/shadow_all.h"
 #    endif

-#  endif /* __SHADOW_RECORD_ALL__ */
-
-/* Record all intersections - Volume BVH traversal. */
-
-#  if defined(__VOLUME_RECORD_ALL__)
-#    define BVH_FUNCTION_NAME bvh_intersect_volume_all
-#    define BVH_FUNCTION_FEATURES BVH_HAIR
-#    include "kernel/bvh/volume_all.h"
-
-#    if defined(__OBJECT_MOTION__)
-#      define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
-#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
-#      include "kernel/bvh/volume_all.h"
-#    endif
-#  endif /* __VOLUME_RECORD_ALL__ */
-
-#  undef BVH_FEATURE
-#  undef BVH_NAME_JOIN
-#  undef BVH_NAME_EVAL
-#  undef BVH_FUNCTION_FULL_NAME
-
-#endif /* !defined(__KERNEL_GPU_RAYTRACING__) */
-
-ccl_device_inline bool scene_intersect_valid(ccl_private const Ray *ray)
-{
-  /* NOTE: Due to some vectorization code  non-finite origin point might
-   * cause lots of false-positive intersections which will overflow traversal
-   * stack.
-   * This code is a quick way to perform early output, to avoid crashes in
-   * such cases.
-   * From production scenes so far it seems it's enough to test first element
-   * only.
-   * Scene intersection may also called with empty rays for conditional trace
-   * calls that evaluate to false, so filter those out.
-   */
-  return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
-}
-
-ccl_device_intersect bool scene_intersect(KernelGlobals kg,
-                                          ccl_private const Ray *ray,
-                                          const uint visibility,
-                                          ccl_private Intersection *isect)
-{
-#ifdef __KERNEL_OPTIX__
-  uint p0 = 0;
-  uint p1 = 0;
-  uint p2 = 0;
-  uint p3 = 0;
-  uint p4 = visibility;
-  uint p5 = PRIMITIVE_NONE;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  uint ray_mask = visibility & 0xFF;
-  uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
-  }
-
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
-             ray->P,
-             ray->D,
-             ray->tmin,
-             ray->tmax,
-             ray->time,
-             ray_mask,
-             ray_flags,
-             0, /* SBT offset for PG_HITD */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  isect->t = __uint_as_float(p0);
-  isect->u = __uint_as_float(p1);
-  isect->v = __uint_as_float(p2);
-  isect->prim = p3;
-  isect->object = p4;
-  isect->type = p5;
-
-  return p5 != PRIMITIVE_NONE;
-#elif defined(__METALRT__)
-
-  if (!scene_intersect_valid(ray)) {
-    isect->t = ray->tmax;
-    isect->type = PRIMITIVE_NONE;
-    return false;
-  }
-
-#  if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    isect->t = ray->tmax;
-    isect->type = PRIMITIVE_NONE;
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
-
-  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
-    isect->t = ray->tmax;
-    isect->type = PRIMITIVE_NONE;
-    kernel_assert(!"Invalid ift_default");
-    return false;
-  }
-#  endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionPayload payload;
-  payload.self = ray->self;
-  payload.u = 0.0f;
-  payload.v = 0.0f;
-  payload.visibility = visibility;
-
-  typename metalrt_intersector_type::result_type intersection;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-    /* No further intersector setup required: Default MetalRT behavior is any-hit. */
-  }
-  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    /* No further intersector setup required: Shadow ray early termination is controlled by the
-     * intersection handler */
-  }
-
-#  if defined(__METALRT_MOTION__)
-  payload.time = ray->time;
-  intersection = metalrt_intersect.intersect(r,
-                                             metal_ancillaries->accel_struct,
-                                             ray_mask,
-                                             ray->time,
-                                             metal_ancillaries->ift_default,
-                                             payload);
-#  else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
-#  endif
-
-  if (intersection.type == intersection_type::none) {
-    isect->t = ray->tmax;
-    isect->type = PRIMITIVE_NONE;
-
-    return false;
-  }
-
-  isect->t = intersection.distance;
-
-  isect->prim = payload.prim;
-  isect->type = payload.type;
-  isect->object = intersection.user_instance_id;
-
-  isect->t = intersection.distance;
-  if (intersection.type == intersection_type::triangle) {
-    isect->u = 1.0f - intersection.triangle_barycentric_coord.y -
-               intersection.triangle_barycentric_coord.x;
-    isect->v = intersection.triangle_barycentric_coord.x;
-  }
-  else {
-    isect->u = payload.u;
-    isect->v = payload.v;
-  }
-
-  return isect->type != PRIMITIVE_NONE;
-
-#else
-
-  if (!scene_intersect_valid(ray)) {
-    return false;
-  }
-
-#  ifdef __EMBREE__
-  if (kernel_data.device_bvh) {
-    isect->t = ray->tmax;
-    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
-    IntersectContext rtc_ctx(&ctx);
-    RTCRayHit ray_hit;
-    ctx.ray = ray;
-    kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
-    rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit);
-    if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID &&
-        ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) {
-      kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
-      return true;
-    }
-    return false;
-  }
-#  endif /* __EMBREE__ */
-
-#  ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
-#    ifdef __HAIR__
-    if (kernel_data.bvh.have_curves) {
-      return bvh_intersect_hair_motion(kg, ray, isect, visibility);
-    }
-#    endif /* __HAIR__ */
-
-    return bvh_intersect_motion(kg, ray, isect, visibility);
-  }
-#  endif   /* __OBJECT_MOTION__ */
-
-#  ifdef __HAIR__
-  if (kernel_data.bvh.have_curves) {
-    return bvh_intersect_hair(kg, ray, isect, visibility);
-  }
-#  endif /* __HAIR__ */
-
-  return bvh_intersect(kg, ray, isect, visibility);
-#endif   /* __KERNEL_OPTIX__ */
-}
-
-#ifdef __BVH_LOCAL__
-ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
-                                                ccl_private const Ray *ray,
-                                                ccl_private LocalIntersection *local_isect,
-                                                int local_object,
-                                                ccl_private uint *lcg_state,
-                                                int max_hits)
-{
-#  ifdef __KERNEL_OPTIX__
-  uint p0 = pointer_pack_to_uint_0(lcg_state);
-  uint p1 = pointer_pack_to_uint_1(lcg_state);
-  uint p2 = pointer_pack_to_uint_0(local_isect);
-  uint p3 = pointer_pack_to_uint_1(local_isect);
-  uint p4 = local_object;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
-  uint p5 = max_hits;
-
-  if (local_isect) {
-    local_isect->num_hits = 0; /* Initialize hit count to zero. */
-  }
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
-             ray->P,
-             ray->D,
-             ray->tmin,
-             ray->tmax,
-             ray->time,
-             0xFF,
-             /* Need to always call into __anyhit__kernel_optix_local_hit. */
-             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2, /* SBT offset for PG_HITL */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  return p5;
-#  elif defined(__METALRT__)
-  if (!scene_intersect_valid(ray)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    return false;
-  }
-
-#    if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
-
-  if (is_null_intersection_function_table(metal_ancillaries->ift_local)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    kernel_assert(!"Invalid ift_local");
-    return false;
-  }
-#    endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionLocalPayload payload;
-  payload.self = ray->self;
-  payload.local_object = local_object;
-  payload.max_hits = max_hits;
-  payload.local_isect.num_hits = 0;
-  if (lcg_state) {
-    payload.has_lcg_state = true;
-    payload.lcg_state = *lcg_state;
-  }
-  payload.result = false;
-
-  typename metalrt_intersector_type::result_type intersection;
-
-#    if defined(__METALRT_MOTION__)
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
-#    else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
-#    endif
-
-  if (lcg_state) {
-    *lcg_state = payload.lcg_state;
-  }
-  *local_isect = payload.local_isect;
-
-  return payload.result;
-
-#  else
-
-  if (!scene_intersect_valid(ray)) {
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    return false;
-  }
-
-#    ifdef __EMBREE__
-  if (kernel_data.device_bvh) {
-    const bool has_bvh = !(kernel_data_fetch(object_flag, local_object) &
-                           SD_OBJECT_TRANSFORM_APPLIED);
-    CCLIntersectContext ctx(
-        kg, has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
-    ctx.lcg_state = lcg_state;
-    ctx.max_hits = max_hits;
-    ctx.ray = ray;
-    ctx.local_isect = local_isect;
-    if (local_isect) {
-      local_isect->num_hits = 0;
-    }
-    ctx.local_object_id = local_object;
-    IntersectContext rtc_ctx(&ctx);
-    RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
-
-    /* If this object has its own BVH, use it. */
-    if (has_bvh) {
-      RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2);
-      if (geom) {
-        float3 P = ray->P;
-        float3 dir = ray->D;
-        float3 idir = ray->D;
-        Transform ob_itfm;
-        rtc_ray.tfar = ray->tmax *
-                       bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
-        /* bvh_instance_motion_push() returns the inverse transform but
-         * it's not needed here. */
-        (void)ob_itfm;
-
-        rtc_ray.org_x = P.x;
-        rtc_ray.org_y = P.y;
-        rtc_ray.org_z = P.z;
-        rtc_ray.dir_x = dir.x;
-        rtc_ray.dir_y = dir.y;
-        rtc_ray.dir_z = dir.z;
-        RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
-        kernel_assert(scene);
-        if (scene) {
-          rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
-        }
-      }
-    }
-    else {
-      rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
-    }
-
-    /* rtcOccluded1 sets tfar to -inf if a hit was found. */
-    return (local_isect && local_isect->num_hits > 0) || (rtc_ray.tfar < 0);
-    ;
-  }
-#    endif /* __EMBREE__ */
-
-#    ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
-    return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
-  }
-#    endif /* __OBJECT_MOTION__ */
-  return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
-#  endif   /* __KERNEL_OPTIX__ */
-}
-#endif
-
-#ifdef __SHADOW_RECORD_ALL__
 ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
                                                     IntegratorShadowState state,
                                                     ccl_private const Ray *ray,
@@ -524,109 +177,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
                                                     ccl_private uint *num_recorded_hits,
                                                     ccl_private float *throughput)
 {
-#  ifdef __KERNEL_OPTIX__
-  uint p0 = state;
-  uint p1 = __float_as_uint(1.0f); /* Throughput. */
-  uint p2 = 0;                     /* Number of hits. */
-  uint p3 = max_hits;
-  uint p4 = visibility;
-  uint p5 = false;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
-             ray->P,
-             ray->D,
-             ray->tmin,
-             ray->tmax,
-             ray->time,
-             ray_mask,
-             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
-             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1, /* SBT offset for PG_HITS */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  *num_recorded_hits = uint16_unpack_from_uint_0(p2);
-  *throughput = __uint_as_float(p1);
-
-  return p5;
-#  elif defined(__METALRT__)
-
-  if (!scene_intersect_valid(ray)) {
-    return false;
-  }
-
-#    if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
-
-  if (is_null_intersection_function_table(metal_ancillaries->ift_shadow)) {
-    kernel_assert(!"Invalid ift_shadow");
-    return false;
-  }
-#    endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionShadowPayload payload;
-  payload.self = ray->self;
-  payload.visibility = visibility;
-  payload.max_hits = max_hits;
-  payload.num_hits = 0;
-  payload.num_recorded_hits = 0;
-  payload.throughput = 1.0f;
-  payload.result = false;
-  payload.state = state;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-
-  typename metalrt_intersector_type::result_type intersection;
-
-#    if defined(__METALRT_MOTION__)
-  payload.time = ray->time;
-  intersection = metalrt_intersect.intersect(r,
-                                             metal_ancillaries->accel_struct,
-                                             ray_mask,
-                                             ray->time,
-                                             metal_ancillaries->ift_shadow,
-                                             payload);
-#    else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_shadow, payload);
-#    endif
-
-  *num_recorded_hits = payload.num_recorded_hits;
-  *throughput = payload.throughput;
-
-  return payload.result;
-
-#  else
-  if (!scene_intersect_valid(ray)) {
+  if (!intersection_ray_valid(ray)) {
    *num_recorded_hits = 0;
    *throughput = 1.0f;
    return false;
@@ -634,21 +185,10 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,

 #    ifdef __EMBREE__
  if (kernel_data.device_bvh) {
-    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
-    Intersection *isect_array = (Intersection *)state->shadow_isect;
-    ctx.isect_s = isect_array;
-    ctx.max_hits = max_hits;
-    ctx.ray = ray;
-    IntersectContext rtc_ctx(&ctx);
-    RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
-    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
-
-    *num_recorded_hits = ctx.num_recorded_hits;
-    *throughput = ctx.throughput;
-    return ctx.opaque_hit;
+    return kernel_embree_intersect_shadow_all(
+        kg, state, ray, visibility, max_hits, num_recorded_hits, throughput);
  }
-#    endif /* __EMBREE__ */
+#    endif

 #    ifdef __OBJECT_MOTION__
  if (kernel_data.bvh.have_motion) {
@@ -662,7 +202,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
    return bvh_intersect_shadow_all_motion(
        kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
  }
-#    endif   /* __OBJECT_MOTION__ */
+#    endif /* __OBJECT_MOTION__ */

 #    ifdef __HAIR__
  if (kernel_data.bvh.have_curves) {
@@ -673,132 +213,29 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,

  return bvh_intersect_shadow_all(
      kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
-#  endif   /* __KERNEL_OPTIX__ */
 }
-#endif /* __SHADOW_RECORD_ALL__ */
+#  endif /* __SHADOW_RECORD_ALL__ */
+
+/* Volume BVH traversal, for initializing or updating the volume stack. */
+
+#  if defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__)
+
+#    define BVH_FUNCTION_NAME bvh_intersect_volume
+#    define BVH_FUNCTION_FEATURES BVH_HAIR
+#    include "kernel/bvh/volume.h"
+
+#    if defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_volume_motion
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+#      include "kernel/bvh/volume.h"
+#    endif

-#ifdef __VOLUME__
 ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
                                                 ccl_private const Ray *ray,
                                                 ccl_private Intersection *isect,
                                                 const uint visibility)
 {
-#  ifdef __KERNEL_OPTIX__
-  uint p0 = 0;
-  uint p1 = 0;
-  uint p2 = 0;
-  uint p3 = 0;
-  uint p4 = visibility;
-  uint p5 = PRIMITIVE_NONE;
-  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
-  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
-             ray->P,
-             ray->D,
-             ray->tmin,
-             ray->tmax,
-             ray->time,
-             ray_mask,
-             /* Need to always call into __anyhit__kernel_optix_volume_test. */
-             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             3, /* SBT offset for PG_HITV */
-             0,
-             0,
-             p0,
-             p1,
-             p2,
-             p3,
-             p4,
-             p5,
-             p6,
-             p7);
-
-  isect->t = __uint_as_float(p0);
-  isect->u = __uint_as_float(p1);
-  isect->v = __uint_as_float(p2);
-  isect->prim = p3;
-  isect->object = p4;
-  isect->type = p5;
-
-  return p5 != PRIMITIVE_NONE;
-#  elif defined(__METALRT__)
-
-  if (!scene_intersect_valid(ray)) {
-    return false;
-  }
-#    if defined(__KERNEL_DEBUG__)
-  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
-    return false;
-  }
-
-  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
-    kernel_assert(!"Invalid ift_default");
-    return false;
-  }
-#    endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-  if (!kernel_data.bvh.have_curves) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
-  }
-
-  MetalRTIntersectionPayload payload;
-  payload.self = ray->self;
-  payload.visibility = visibility;
-
-  typename metalrt_intersector_type::result_type intersection;
-
-  uint ray_mask = visibility & 0xFF;
-  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
-    ray_mask = 0xFF;
-  }
-
-#    if defined(__METALRT_MOTION__)
-  payload.time = ray->time;
-  intersection = metalrt_intersect.intersect(r,
-                                             metal_ancillaries->accel_struct,
-                                             ray_mask,
-                                             ray->time,
-                                             metal_ancillaries->ift_default,
-                                             payload);
-#    else
-  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
-#    endif
-
-  if (intersection.type == intersection_type::none) {
-    return false;
-  }
-
-  isect->prim = payload.prim;
-  isect->type = payload.type;
-  isect->object = intersection.user_instance_id;
-
-  isect->t = intersection.distance;
-  if (intersection.type == intersection_type::triangle) {
-    isect->u = 1.0f - intersection.triangle_barycentric_coord.y -
-               intersection.triangle_barycentric_coord.x;
-    isect->v = intersection.triangle_barycentric_coord.x;
-  }
-  else {
-    isect->u = payload.u;
-    isect->v = payload.v;
-  }
-
-  return isect->type != PRIMITIVE_NONE;
-
-#  else
-  if (!scene_intersect_valid(ray)) {
+  if (!intersection_ray_valid(ray)) {
    return false;
  }

@@ -809,44 +246,56 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
 #    endif /* __OBJECT_MOTION__ */

  return bvh_intersect_volume(kg, ray, isect, visibility);
-#  endif   /* __KERNEL_OPTIX__ */
 }
-#endif /* __VOLUME__ */
+#  endif /* defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__) */

-#ifdef __VOLUME_RECORD_ALL__
-ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg,
-                                                     ccl_private const Ray *ray,
-                                                     ccl_private Intersection *isect,
-                                                     const uint max_hits,
-                                                     const uint visibility)
+/* Volume BVH traversal, for initializing or updating the volume stack.
+ * Variation that records multiple intersections at once. */
+
+#  if defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__)
+
+#    define BVH_FUNCTION_NAME bvh_intersect_volume_all
+#    define BVH_FUNCTION_FEATURES BVH_HAIR
+#    include "kernel/bvh/volume_all.h"
+
+#    if defined(__OBJECT_MOTION__)
+#      define BVH_FUNCTION_NAME bvh_intersect_volume_all_motion
+#      define BVH_FUNCTION_FEATURES BVH_MOTION | BVH_HAIR
+#      include "kernel/bvh/volume_all.h"
+#    endif
+
+ccl_device_intersect uint scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint max_hits,
+                                                 const uint visibility)
 {
-  if (!scene_intersect_valid(ray)) {
+  if (!intersection_ray_valid(ray)) {
    return false;
  }

-#  ifdef __EMBREE__
+#    ifdef __EMBREE__
  if (kernel_data.device_bvh) {
-    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
-    ctx.isect_s = isect;
-    ctx.max_hits = max_hits;
-    ctx.num_hits = 0;
-    ctx.ray = ray;
-    IntersectContext rtc_ctx(&ctx);
-    RTCRay rtc_ray;
-    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
-    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
-    return ctx.num_hits;
+    return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility);
  }
-#  endif /* __EMBREE__ */
+#    endif

-#  ifdef __OBJECT_MOTION__
+#    ifdef __OBJECT_MOTION__
  if (kernel_data.bvh.have_motion) {
    return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
  }
-#  endif /* __OBJECT_MOTION__ */
+#    endif /* __OBJECT_MOTION__ */

  return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
 }
-#endif /* __VOLUME_RECORD_ALL__ */
+
+#  endif /* defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__) */
+
+#  undef BVH_FEATURE
+#  undef BVH_NAME_JOIN
+#  undef BVH_NAME_EVAL
+#  undef BVH_FUNCTION_FULL_NAME
+
+#endif /* __BVH2__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/bvh/embree.h
+++ b/intern/cycles/kernel/bvh/embree.h
@@ -1,176 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2018-2022 Blender Foundation. */
-
-#pragma once
-
-#include <embree3/rtcore_ray.h>
-#include <embree3/rtcore_scene.h>
-
-#include "kernel/device/cpu/compat.h"
-#include "kernel/device/cpu/globals.h"
-
-#include "kernel/bvh/util.h"
-
-#include "util/vector.h"
-
-CCL_NAMESPACE_BEGIN
-
-struct CCLIntersectContext {
-  typedef enum {
-    RAY_REGULAR = 0,
-    RAY_SHADOW_ALL = 1,
-    RAY_LOCAL = 2,
-    RAY_SSS = 3,
-    RAY_VOLUME_ALL = 4,
-  } RayType;
-
-  KernelGlobals kg;
-  RayType type;
-
-  /* For avoiding self intersections */
-  const Ray *ray;
-
-  /* for shadow rays */
-  Intersection *isect_s;
-  uint max_hits;
-  uint num_hits;
-  uint num_recorded_hits;
-  float throughput;
-  float max_t;
-  bool opaque_hit;
-
-  /* for SSS Rays: */
-  LocalIntersection *local_isect;
-  int local_object_id;
-  uint *lcg_state;
-
-  CCLIntersectContext(KernelGlobals kg_, RayType type_)
-  {
-    kg = kg_;
-    type = type_;
-    ray = NULL;
-    max_hits = 1;
-    num_hits = 0;
-    num_recorded_hits = 0;
-    throughput = 1.0f;
-    max_t = FLT_MAX;
-    opaque_hit = false;
-    isect_s = NULL;
-    local_isect = NULL;
-    local_object_id = -1;
-    lcg_state = NULL;
-  }
-};
-
-class IntersectContext {
- public:
-  IntersectContext(CCLIntersectContext *ctx)
-  {
-    rtcInitIntersectContext(&context);
-    userRayExt = ctx;
-  }
-  RTCIntersectContext context;
-  CCLIntersectContext *userRayExt;
-};
-
-ccl_device_inline void kernel_embree_setup_ray(const Ray &ray,
-                                               RTCRay &rtc_ray,
-                                               const uint visibility)
-{
-  rtc_ray.org_x = ray.P.x;
-  rtc_ray.org_y = ray.P.y;
-  rtc_ray.org_z = ray.P.z;
-  rtc_ray.dir_x = ray.D.x;
-  rtc_ray.dir_y = ray.D.y;
-  rtc_ray.dir_z = ray.D.z;
-  rtc_ray.tnear = ray.tmin;
-  rtc_ray.tfar = ray.tmax;
-  rtc_ray.time = ray.time;
-  rtc_ray.mask = visibility;
-}
-
-ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
-                                                  RTCRayHit &rayhit,
-                                                  const uint visibility)
-{
-  kernel_embree_setup_ray(ray, rayhit.ray, visibility);
-  rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
-  rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
-}
-
-ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg,
-                                                          const RTCHit *hit,
-                                                          const Ray *ray)
-{
-  bool status = false;
-  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
-    const int oID = hit->instID[0] / 2;
-    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
-      RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-          rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
-      const int pID = hit->primID +
-                      (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
-      status = intersection_skip_self_shadow(ray->self, oID, pID);
-    }
-  }
-  else {
-    const int oID = hit->geomID / 2;
-    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
-      const int pID = hit->primID + (intptr_t)rtcGetGeometryUserData(
-                                        rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
-      status = intersection_skip_self_shadow(ray->self, oID, pID);
-    }
-  }
-
-  return status;
-}
-
-ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
-                                                 const RTCRay *ray,
-                                                 const RTCHit *hit,
-                                                 Intersection *isect)
-{
-  isect->t = ray->tfar;
-  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
-    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-        rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
-    isect->prim = hit->primID +
-                  (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
-    isect->object = hit->instID[0] / 2;
-  }
-  else {
-    isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(
-                                    rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
-    isect->object = hit->geomID / 2;
-  }
-
-  const bool is_hair = hit->geomID & 1;
-  if (is_hair) {
-    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, isect->prim);
-    isect->type = segment.type;
-    isect->prim = segment.prim;
-    isect->u = hit->u;
-    isect->v = hit->v;
-  }
-  else {
-    isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
-    isect->u = 1.0f - hit->v - hit->u;
-    isect->v = hit->u;
-  }
-}
-
-ccl_device_inline void kernel_embree_convert_sss_hit(
-    KernelGlobals kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int object)
-{
-  isect->u = 1.0f - hit->v - hit->u;
-  isect->v = hit->u;
-  isect->t = ray->tfar;
-  RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-      rtcGetGeometry(kernel_data.device_bvh, object * 2));
-  isect->prim = hit->primID +
-                (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
-  isect->object = object;
-  isect->type = kernel_data_fetch(objects, object).primitive_type;
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/kernel/bvh/local.h
+++ b/intern/cycles/kernel/bvh/local.h
@@ -59,14 +59,10 @@ ccl_device_inline
  const int object_flag = kernel_data_fetch(object_flag, local_object);
  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
-    Transform ob_itfm;
-    const float t_world_to_instance = bvh_instance_motion_push(
-        kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
+    bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir);
 #else
-    const float t_world_to_instance = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
+    bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
 #endif
-    isect_t *= t_world_to_instance;
-    tmin *= t_world_to_instance;
    object = local_object;
  }

--- a/intern/cycles/kernel/bvh/metal.h
+++ b/intern/cycles/kernel/bvh/metal.h
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2021-2022 Blender Foundation */
-
-struct MetalRTIntersectionPayload {
-  RaySelfPrimitives self;
-  uint visibility;
-  float u, v;
-  int prim;
-  int type;
-#if defined(__METALRT_MOTION__)
-  float time;
-#endif
-};
-
-struct MetalRTIntersectionLocalPayload {
-  RaySelfPrimitives self;
-  uint local_object;
-  uint lcg_state;
-  short max_hits;
-  bool has_lcg_state;
-  bool result;
-  LocalIntersection local_isect;
-};
-
-struct MetalRTIntersectionShadowPayload {
-  RaySelfPrimitives self;
-  uint visibility;
-#if defined(__METALRT_MOTION__)
-  float time;
-#endif
-  int state;
-  float throughput;
-  short max_hits;
-  short num_hits;
-  short num_recorded_hits;
-  bool result;
-};
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -53,23 +53,11 @@ ccl_device_inline
  int object = OBJECT_NONE;
  uint num_hits = 0;

-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
  /* Max distance in world space. May be dynamically reduced when max number of
   * recorded hits is exceeded and we no longer need to find hits beyond the max
   * distance found. */
-  float t_max_world = ray->tmax;
-
-  /* Current maximum distance to the intersection.
-   * Is calculated as a ray length, transformed to an object space when entering
-   * instance node. */
-  float t_max_current = ray->tmax;
-
-  /* Conversion from world to local space for the current instance if any, 1.0
-   * otherwise. */
-  float t_world_to_instance = 1.0f;
+  const float tmax = ray->tmax;
+  float tmax_hits = tmax;

  *r_num_recorded_hits = 0;
  *r_throughput = 1.0f;
@@ -90,7 +78,7 @@ ccl_device_inline
 #endif
                                       idir,
                                       tmin,
-                                       t_max_current,
+                                       tmax,
                                       node_addr,
                                       visibility,
                                       dist);
@@ -158,16 +146,8 @@ ccl_device_inline

            switch (type & PRIMITIVE_ALL) {
              case PRIMITIVE_TRIANGLE: {
-                hit = triangle_intersect(kg,
-                                         &isect,
-                                         P,
-                                         dir,
-                                         tmin,
-                                         t_max_current,
-                                         visibility,
-                                         prim_object,
-                                         prim,
-                                         prim_addr);
+                hit = triangle_intersect(
+                    kg, &isect, P, dir, tmin, tmax, visibility, prim_object, prim, prim_addr);
                break;
              }
 #if BVH_FEATURE(BVH_MOTION)
@@ -177,7 +157,7 @@ ccl_device_inline
                                                P,
                                                dir,
                                                tmin,
-                                                t_max_current,
+                                                tmax,
                                                ray->time,
                                                visibility,
                                                prim_object,
@@ -200,16 +180,8 @@ ccl_device_inline
                }

                const int curve_type = kernel_data_fetch(prim_type, prim_addr);
-                hit = curve_intersect(kg,
-                                      &isect,
-                                      P,
-                                      dir,
-                                      tmin,
-                                      t_max_current,
-                                      prim_object,
-                                      prim,
-                                      ray->time,
-                                      curve_type);
+                hit = curve_intersect(
+                    kg, &isect, P, dir, tmin, tmax, prim_object, prim, ray->time, curve_type);

                break;
              }
@@ -226,16 +198,8 @@ ccl_device_inline
                }

                const int point_type = kernel_data_fetch(prim_type, prim_addr);
-                hit = point_intersect(kg,
-                                      &isect,
-                                      P,
-                                      dir,
-                                      tmin,
-                                      t_max_current,
-                                      prim_object,
-                                      prim,
-                                      ray->time,
-                                      point_type);
+                hit = point_intersect(
+                    kg, &isect, P, dir, tmin, tmax, prim_object, prim, ray->time, point_type);
                break;
              }
 #endif /* BVH_FEATURE(BVH_POINTCLOUD) */
@@ -247,9 +211,6 @@ ccl_device_inline

            /* shadow ray early termination */
            if (hit) {
-              /* Convert intersection distance to world space. */
-              isect.t /= t_world_to_instance;
-
              /* detect if this surface has a shader with transparent shadows */
              /* todo: optimize so primitive visibility flag indicates if
               * the primitive has a transparent shadow shader? */
@@ -281,7 +242,7 @@ ccl_device_inline
              if (record_intersection) {
                /* Test if we need to record this transparent intersection. */
                const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
-                if (*r_num_recorded_hits < max_record_hits || isect.t < t_max_world) {
+                if (*r_num_recorded_hits < max_record_hits || isect.t < tmax_hits) {
                  /* If maximum number of hits was reached, replace the intersection with the
                   * highest distance. We want to find the N closest intersections. */
                  const uint num_recorded_hits = min(*r_num_recorded_hits, max_record_hits);
@@ -303,7 +264,7 @@ ccl_device_inline
                    }

                    /* Limit the ray distance and stop counting hits beyond this. */
-                    t_max_world = max(isect.t, max_t);
+                    tmax_hits = max(isect.t, max_t);
                  }

                  integrator_state_write_shadow_isect(state, &isect, isect_index);
@@ -321,16 +282,11 @@ ccl_device_inline
          object = kernel_data_fetch(prim_object, -prim_addr - 1);

 #if BVH_FEATURE(BVH_MOTION)
-          t_world_to_instance = bvh_instance_motion_push(
-              kg, object, ray, &P, &dir, &idir, &ob_itfm);
+          bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-          t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+          bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

-          /* Convert intersection to object space. */
-          t_max_current *= t_world_to_instance;
-          tmin *= t_world_to_instance;
-
          ++stack_ptr;
          kernel_assert(stack_ptr < BVH_STACK_SIZE);
          traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -344,18 +300,9 @@ ccl_device_inline
      kernel_assert(object != OBJECT_NONE);

      /* Instance pop. */
-#if BVH_FEATURE(BVH_MOTION)
-      bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
-      bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
-
-      /* Restore world space ray length. */
-      tmin = ray->tmin;
-      t_max_current = ray->tmax;
+      bvh_instance_pop(ray, &P, &dir, &idir);

      object = OBJECT_NONE;
-      t_world_to_instance = 1.0f;
      node_addr = traversal_stack[stack_ptr];
      --stack_ptr;
    }
--- a/intern/cycles/kernel/bvh/traversal.h
+++ b/intern/cycles/kernel/bvh/traversal.h
@@ -43,13 +43,9 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
-  float tmin = ray->tmin;
+  const float tmin = ray->tmin;
  int object = OBJECT_NONE;

-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
  isect->t = ray->tmax;
  isect->u = 0.0f;
  isect->v = 0.0f;
@@ -223,15 +219,11 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
          object = kernel_data_fetch(prim_object, -prim_addr - 1);

 #if BVH_FEATURE(BVH_MOTION)
-          const float t_world_to_instance = bvh_instance_motion_push(
-              kg, object, ray, &P, &dir, &idir, &ob_itfm);
+          bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-          const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+          bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

-          isect->t *= t_world_to_instance;
-          tmin *= t_world_to_instance;
-
          ++stack_ptr;
          kernel_assert(stack_ptr < BVH_STACK_SIZE);
          traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -245,12 +237,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
      kernel_assert(object != OBJECT_NONE);

      /* instance pop */
-#if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#endif
-      tmin = ray->tmin;
+      bvh_instance_pop(ray, &P, &dir, &idir);

      object = OBJECT_NONE;
      node_addr = traversal_stack[stack_ptr];
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -5,20 +5,35 @@

 CCL_NAMESPACE_BEGIN

+ccl_device_inline bool intersection_ray_valid(ccl_private const Ray *ray)
+{
+  /* Returns true only if ray->P.x and ray->D.x are finite and len_squared(ray->D) != 0.
+   * NOTE: Due to some vectorization code, a non-finite origin point might cause
+   * lots of false-positive intersections which will overflow the traversal
+   * stack. This check is a quick way to exit early and avoid crashes in such
+   * cases.
+   * From production scenes so far it seems to be enough to test the first
+   * element only.
+   * Scene intersection may also be called with empty rays for conditional
+   * trace calls that evaluate to false, so filter those out.
+   */
+  return isfinite_safe(ray->P.x) && isfinite_safe(ray->D.x) && len_squared(ray->D) != 0.0f;
+}
+
 /* Offset intersection distance by the smallest possible amount, to skip
 * intersections at this distance. This works in cases where the ray start
 * position is unchanged and only tmin is updated, since for self
 * intersection we'll be comparing against the exact same distances. */
 ccl_device_forceinline float intersection_t_offset(const float t)
 {
-  /* This is a simplified version of nextafterf(t, FLT_MAX), only dealing with
+  /* This is a simplified version of `nextafterf(t, FLT_MAX)`, only dealing with
   * non-negative and finite t. */
  kernel_assert(t >= 0.0f && isfinite_safe(t));
  const uint32_t bits = (t == 0.0f) ? 1 : __float_as_uint(t) + 1;
  return __uint_as_float(bits);
 }

-#if defined(__KERNEL_CPU__)
+#ifndef __KERNEL_GPU__
 ccl_device int intersections_compare(const void *a, const void *b)
 {
  const Intersection *isect_a = (const Intersection *)a;
--- a/intern/cycles/kernel/bvh/volume.h
+++ b/intern/cycles/kernel/bvh/volume.h
@@ -46,13 +46,9 @@ ccl_device_inline
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
-  float tmin = ray->tmin;
+  const float tmin = ray->tmin;
  int object = OBJECT_NONE;

-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
  isect->t = ray->tmax;
  isect->u = 0.0f;
  isect->v = 0.0f;
@@ -189,15 +185,11 @@ ccl_device_inline
          int object_flag = kernel_data_fetch(object_flag, object);
          if (object_flag & SD_OBJECT_HAS_VOLUME) {
 #if BVH_FEATURE(BVH_MOTION)
-            const float t_world_to_instance = bvh_instance_motion_push(
-                kg, object, ray, &P, &dir, &idir, &ob_itfm);
+            bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-            const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+            bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

-            isect->t *= t_world_to_instance;
-            tmin *= t_world_to_instance;
-
            ++stack_ptr;
            kernel_assert(stack_ptr < BVH_STACK_SIZE);
            traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -218,13 +210,7 @@ ccl_device_inline
      kernel_assert(object != OBJECT_NONE);

      /* instance pop */
-#if BVH_FEATURE(BVH_MOTION)
-      isect->t = bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, isect->t, &ob_itfm);
-#else
-      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
-#endif
-
-      tmin = ray->tmin;
+      bvh_instance_pop(ray, &P, &dir, &idir);

      object = OBJECT_NONE;
      node_addr = traversal_stack[stack_ptr];
--- a/intern/cycles/kernel/bvh/volume_all.h
+++ b/intern/cycles/kernel/bvh/volume_all.h
@@ -47,14 +47,10 @@ ccl_device_inline
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
-  float tmin = ray->tmin;
+  const float tmin = ray->tmin;
  int object = OBJECT_NONE;
  float isect_t = ray->tmax;

-#if BVH_FEATURE(BVH_MOTION)
-  Transform ob_itfm;
-#endif
-
  int num_hits_in_instance = 0;

  uint num_hits = 0;
@@ -159,18 +155,6 @@ ccl_device_inline
                  num_hits_in_instance++;
                  isect_array->t = isect_t;
                  if (num_hits == max_hits) {
-                    if (object != OBJECT_NONE) {
-#if BVH_FEATURE(BVH_MOTION)
-                      float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#else
-                      Transform itfm = object_fetch_transform(
-                          kg, object, OBJECT_INVERSE_TRANSFORM);
-                      float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#endif
-                      for (int i = 0; i < num_hits_in_instance; i++) {
-                        (isect_array - i - 1)->t *= t_fac;
-                      }
-                    }
                    return num_hits;
                  }
                }
@@ -212,18 +196,6 @@ ccl_device_inline
                  num_hits_in_instance++;
                  isect_array->t = isect_t;
                  if (num_hits == max_hits) {
-                    if (object != OBJECT_NONE) {
-#  if BVH_FEATURE(BVH_MOTION)
-                      float t_fac = 1.0f / len(transform_direction(&ob_itfm, dir));
-#  else
-                      Transform itfm = object_fetch_transform(
-                          kg, object, OBJECT_INVERSE_TRANSFORM);
-                      float t_fac = 1.0f / len(transform_direction(&itfm, dir));
-#  endif
-                      for (int i = 0; i < num_hits_in_instance; i++) {
-                        (isect_array - i - 1)->t *= t_fac;
-                      }
-                    }
                    return num_hits;
                  }
                }
@@ -242,15 +214,11 @@ ccl_device_inline
          int object_flag = kernel_data_fetch(object_flag, object);
          if (object_flag & SD_OBJECT_HAS_VOLUME) {
 #if BVH_FEATURE(BVH_MOTION)
-            const float t_world_to_instance = bvh_instance_motion_push(
-                kg, object, ray, &P, &dir, &idir, &ob_itfm);
+            bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir);
 #else
-            const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+            bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

-            isect_t *= t_world_to_instance;
-            tmin *= t_world_to_instance;
-
            num_hits_in_instance = 0;
            isect_array->t = isect_t;

@@ -274,29 +242,7 @@ ccl_device_inline
      kernel_assert(object != OBJECT_NONE);

      /* Instance pop. */
-      if (num_hits_in_instance) {
-        float t_fac;
-#if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac, &ob_itfm);
-#else
-        bvh_instance_pop_factor(kg, object, ray, &P, &dir, &idir, &t_fac);
-#endif
-        /* Scale isect->t to adjust for instancing. */
-        for (int i = 0; i < num_hits_in_instance; i++) {
-          (isect_array - i - 1)->t *= t_fac;
-        }
-      }
-      else {
-#if BVH_FEATURE(BVH_MOTION)
-        bvh_instance_motion_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX, &ob_itfm);
-#else
-        bvh_instance_pop(kg, object, ray, &P, &dir, &idir, FLT_MAX);
-#endif
-      }
-
-      tmin = ray->tmin;
-      isect_t = ray->tmax;
-      isect_array->t = isect_t;
+      bvh_instance_pop(ray, &P, &dir, &idir);

      object = OBJECT_NONE;
      node_addr = traversal_stack[stack_ptr];
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -3,7 +3,7 @@

 #pragma once

-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  include <fenv.h>
 #endif

--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -70,7 +70,7 @@ KERNEL_STRUCT_MEMBER(film, float4, rec709_to_r)
 KERNEL_STRUCT_MEMBER(film, float4, rec709_to_g)
 KERNEL_STRUCT_MEMBER(film, float4, rec709_to_b)
 KERNEL_STRUCT_MEMBER(film, int, is_rec709)
-/* Exposuse. */
+/* Exposure. */
 KERNEL_STRUCT_MEMBER(film, float, exposure)
 /* Passed used. */
 KERNEL_STRUCT_MEMBER(film, int, pass_flag)
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -0,0 +1,572 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* CPU Embree implementation of ray-scene intersection. */
+
+#pragma once
+
+#include <embree3/rtcore_ray.h>
+#include <embree3/rtcore_scene.h>
+
+#include "kernel/device/cpu/compat.h"
+#include "kernel/device/cpu/globals.h"
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+#include "kernel/geom/object.h"
+#include "kernel/integrator/state.h"
+#include "kernel/sample/lcg.h"
+
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* True when the lowest bit of an Embree geometry ID is set, which marks curve (hair)
+ * geometry. The argument is parenthesized so compound expressions such as `a | b`
+ * are masked as a whole rather than only their last operand. */
+#define EMBREE_IS_HAIR(x) ((x) & 1)
+
+/* Intersection context. */
+
+/* Per-query state shared between the Cycles trace functions and the Embree filter
+ * callbacks, which reach it through IntersectContext::userRayExt. */
+struct CCLIntersectContext {
+  /* Kind of query being traced; the occlusion filter dispatches on this value. */
+  typedef enum {
+    RAY_REGULAR = 0,
+    RAY_SHADOW_ALL = 1,
+    RAY_LOCAL = 2,
+    RAY_SSS = 3,
+    RAY_VOLUME_ALL = 4,
+  } RayType;
+
+  KernelGlobals kg;
+  RayType type;
+
+  /* Ray being traced, consulted to skip self intersections. */
+  const Ray *ray;
+
+  /* Hit recording for shadow rays. The volume query also uses isect_s, max_hits
+   * and num_hits. */
+  Intersection *isect_s;
+  uint max_hits;
+  uint num_hits;
+  uint num_recorded_hits;
+  float throughput;
+  float max_t;
+  bool opaque_hit;
+
+  /* Hit recording for local and subsurface rays. */
+  LocalIntersection *local_isect;
+  int local_object_id;
+  uint *lcg_state;
+
+  /* Start with no ray and no output buffers attached, room for a single hit,
+   * full throughput and an unbounded recording distance. */
+  CCLIntersectContext(KernelGlobals kg_, RayType type_)
+      : kg(kg_),
+        type(type_),
+        ray(NULL),
+        isect_s(NULL),
+        max_hits(1),
+        num_hits(0),
+        num_recorded_hits(0),
+        throughput(1.0f),
+        max_t(FLT_MAX),
+        opaque_hit(false),
+        local_isect(NULL),
+        local_object_id(-1),
+        lcg_state(NULL)
+  {
+  }
+};
+
+/* Couples Embree's own intersect context with a pointer to the Cycles context.
+ * The address of `context` is what gets handed to Embree; filter callbacks cast
+ * that pointer back to IntersectContext to reach `userRayExt`. */
+class IntersectContext {
+ public:
+  /* Kept as the first member because callbacks cast the RTCIntersectContext
+   * pointer they receive to IntersectContext. */
+  RTCIntersectContext context;
+  CCLIntersectContext *userRayExt;
+
+  IntersectContext(CCLIntersectContext *ctx) : userRayExt(ctx)
+  {
+    rtcInitIntersectContext(&context);
+  }
+};
+
+/* Utilities. */
+
+/* Copy origin, direction, distance range, time and visibility from a Cycles ray
+ * into an Embree ray. */
+ccl_device_inline void kernel_embree_setup_ray(const Ray &ray,
+                                               RTCRay &rtc_ray,
+                                               const uint visibility)
+{
+  const float3 origin = ray.P;
+  const float3 direction = ray.D;
+
+  rtc_ray.org_x = origin.x;
+  rtc_ray.org_y = origin.y;
+  rtc_ray.org_z = origin.z;
+
+  rtc_ray.dir_x = direction.x;
+  rtc_ray.dir_y = direction.y;
+  rtc_ray.dir_z = direction.z;
+
+  /* Distance range along the ray, and the ray time. */
+  rtc_ray.tnear = ray.tmin;
+  rtc_ray.tfar = ray.tmax;
+  rtc_ray.time = ray.time;
+
+  /* The visibility bits are passed through unchanged as the Embree ray mask. */
+  rtc_ray.mask = visibility;
+}
+
+/* Prepare an Embree ray/hit pair: mark the hit part as empty and fill in the ray
+ * part from the Cycles ray. */
+ccl_device_inline void kernel_embree_setup_rayhit(const Ray &ray,
+                                                  RTCRayHit &rayhit,
+                                                  const uint visibility)
+{
+  /* No geometry and no instance recorded yet. */
+  rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+  rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+
+  kernel_embree_setup_ray(ray, rayhit.ray, visibility);
+}
+
+/* Tell whether an Embree hit has to be skipped as a self intersection of `ray`.
+ * The object index is half the Embree instance ID (or geometry ID when the hit is
+ * not instanced), and the primitive index is the Embree primID plus the offset
+ * stored as user data on the hit geometry. */
+ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg,
+                                                          const RTCHit *hit,
+                                                          const Ray *ray)
+{
+  const bool is_instanced = (hit->instID[0] != RTC_INVALID_GEOMETRY_ID);
+  const int object_id = (is_instanced ? hit->instID[0] : hit->geomID) / 2;
+
+  /* Only hits on the ray's own object or light object need the primitive lookup. */
+  if (ray->self.object != object_id && ray->self.light_object != object_id) {
+    return false;
+  }
+
+  intptr_t prim_offset;
+  if (is_instanced) {
+    /* The instance geometry carries its scene as user data; the primitive offset
+     * is stored on the geometry inside that scene. */
+    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+        rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
+    prim_offset = (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
+  }
+  else {
+    prim_offset = (intptr_t)rtcGetGeometryUserData(
+        rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
+  }
+
+  const int prim_id = hit->primID + prim_offset;
+  return intersection_skip_self_shadow(ray->self, object_id, prim_id);
+}
+
+/* Translate an Embree ray/hit pair into a Cycles Intersection: distance, u/v,
+ * object, primitive and primitive type. */
+ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
+                                                 const RTCRay *ray,
+                                                 const RTCHit *hit,
+                                                 Intersection *isect)
+{
+  /* These are copied over the same way for every kind of hit. */
+  isect->t = ray->tfar;
+  isect->u = hit->u;
+  isect->v = hit->v;
+
+  const bool is_instanced = (hit->instID[0] != RTC_INVALID_GEOMETRY_ID);
+  if (is_instanced) {
+    /* Look up the primitive offset in the scene attached to the instance geometry. */
+    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
+        rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
+    const intptr_t prim_offset = (intptr_t)rtcGetGeometryUserData(
+        rtcGetGeometry(inst_scene, hit->geomID));
+    isect->prim = hit->primID + prim_offset;
+    isect->object = hit->instID[0] / 2;
+  }
+  else {
+    const intptr_t prim_offset = (intptr_t)rtcGetGeometryUserData(
+        rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
+    isect->prim = hit->primID + prim_offset;
+    isect->object = hit->geomID / 2;
+  }
+
+  if (hit->geomID & 1) {
+    /* Odd geometry IDs are curves: the index computed above refers to a curve
+     * segment, which provides the final primitive and type. */
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, isect->prim);
+    isect->type = segment.type;
+    isect->prim = segment.prim;
+  }
+  else {
+    isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
+  }
+}
+
+/* Fill an Intersection for a hit found while tracing against the scene of a single
+ * object. The object index is supplied by the caller, and the primitive offset is
+ * looked up in the scene stored as user data on Embree geometry `object * 2`. */
+ccl_device_inline void kernel_embree_convert_sss_hit(
+    KernelGlobals kg, const RTCRay *ray, const RTCHit *hit, Intersection *isect, int object)
+{
+  RTCGeometry object_geom = rtcGetGeometry(kernel_data.device_bvh, object * 2);
+  RTCScene object_scene = (RTCScene)rtcGetGeometryUserData(object_geom);
+  const intptr_t prim_offset = (intptr_t)rtcGetGeometryUserData(
+      rtcGetGeometry(object_scene, hit->geomID));
+
+  isect->object = object;
+  isect->prim = hit->primID + prim_offset;
+  isect->type = kernel_data_fetch(objects, object).primitive_type;
+
+  isect->t = ray->tfar;
+  isect->u = hit->u;
+  isect->v = hit->v;
+}
+
+/* Ray filter functions. */
+
+/* Intersection filter, called by Embree at every valid ray/object intersection.
+ * Its only task is to reject hits that the ray has to skip as self intersections;
+ * every other hit is left untouched. */
+ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Cycles only issues single-ray queries, so exactly one ray/hit pair is expected. */
+  assert(args->N == 1);
+
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const RTCHit *hit = (RTCHit *)args->hit;
+
+  const bool is_self_hit = kernel_embree_is_self_intersection(ctx->kg, hit, ctx->ray);
+  if (is_self_hit) {
+    /* This tells Embree to continue tracing. */
+    *args->valid = 0;
+  }
+}
+
+/* Occlusion filter, called by Embree at every valid ray/object intersection.
+ * Things like recording subsurface or shadow hits for later evaluation
+ * as well as filtering for volume objects happen here.
+ * Cycles' own BVH does that directly inside the traversal calls.
+ *
+ * The behavior depends on the ray type stored in the CCLIntersectContext.
+ * Writing 0 to `*args->valid` rejects the hit and lets Embree continue tracing;
+ * returning without touching it accepts the hit.
+ */
+ccl_device void kernel_embree_filter_occluded_func(const RTCFilterFunctionNArguments *args)
+{
+  /* Current implementation in Cycles assumes only single-ray intersection queries. */
+  assert(args->N == 1);
+
+  const RTCRay *ray = (RTCRay *)args->ray;
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+  const KernelGlobalsCPU *kg = ctx->kg;
+  const Ray *cray = ctx->ray;
+
+  switch (ctx->type) {
+    /* Shadow rays: accumulate transparent hits, stop at the first opaque one. */
+    case CCLIntersectContext::RAY_SHADOW_ALL: {
+      Intersection current_isect;
+      kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+      if (intersection_skip_self_shadow(cray->self, current_isect.object, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
+      /* If no transparent shadows or max number of hits exceeded, all light is blocked. */
+      const int flags = intersection_get_shader_flags(kg, current_isect.prim, current_isect.type);
+      if (!(flags & (SD_HAS_TRANSPARENT_SHADOW)) || ctx->num_hits >= ctx->max_hits) {
+        /* The hit is accepted here: `valid` is left untouched. */
+        ctx->opaque_hit = true;
+        return;
+      }
+
+      ++ctx->num_hits;
+
+      /* Always use baked shadow transparency for curves. */
+      if (current_isect.type & PRIMITIVE_CURVE) {
+        ctx->throughput *= intersection_curve_shadow_transparency(
+            kg, current_isect.object, current_isect.prim, current_isect.u);
+
+        /* Below the cutoff the ray is treated as fully blocked; otherwise the curve
+         * hit is folded into the throughput and not recorded. */
+        if (ctx->throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+          ctx->opaque_hit = true;
+          return;
+        }
+        else {
+          *args->valid = 0;
+          return;
+        }
+      }
+
+      /* Test if we need to record this transparent intersection. */
+      const uint max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+      if (ctx->num_recorded_hits < max_record_hits || ray->tfar < ctx->max_t) {
+        /* If maximum number of hits was reached, replace the intersection with the
+         * highest distance. We want to find the N closest intersections. */
+        const uint num_recorded_hits = min(ctx->num_recorded_hits, max_record_hits);
+        uint isect_index = num_recorded_hits;
+        if (num_recorded_hits + 1 >= max_record_hits) {
+          /* Find the recorded hit with the largest distance. */
+          float max_t = ctx->isect_s[0].t;
+          uint max_recorded_hit = 0;
+
+          for (uint i = 1; i < num_recorded_hits; ++i) {
+            if (ctx->isect_s[i].t > max_t) {
+              max_recorded_hit = i;
+              max_t = ctx->isect_s[i].t;
+            }
+          }
+
+          /* Array already full: overwrite the farthest hit instead of appending. */
+          if (num_recorded_hits >= max_record_hits) {
+            isect_index = max_recorded_hit;
+          }
+
+          /* Limit the ray distance and stop counting hits beyond this.
+           * TODO: is there some way we can tell Embree to stop intersecting beyond
+           * this distance when max number of hits is reached?. Or maybe it will
+           * become irrelevant if we make max_hits a very high number on the CPU. */
+          ctx->max_t = max(current_isect.t, max_t);
+        }
+
+        ctx->isect_s[isect_index] = current_isect;
+      }
+
+      /* Always increase the number of recorded hits, even beyond the maximum,
+       * so that we can detect this and trace another ray if needed. */
+      ++ctx->num_recorded_hits;
+
+      /* This tells Embree to continue tracing. */
+      *args->valid = 0;
+      break;
+    }
+    /* Local and subsurface rays: collect hits that belong to one specific object. */
+    case CCLIntersectContext::RAY_LOCAL:
+    case CCLIntersectContext::RAY_SSS: {
+      /* Check if it's hitting the correct object. */
+      Intersection current_isect;
+      if (ctx->type == CCLIntersectContext::RAY_SSS) {
+        /* The hit is attributed to local_object_id without an object comparison. */
+        kernel_embree_convert_sss_hit(kg, ray, hit, &current_isect, ctx->local_object_id);
+      }
+      else {
+        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (ctx->local_object_id != current_isect.object) {
+          /* This tells Embree to continue tracing. */
+          *args->valid = 0;
+          break;
+        }
+      }
+      if (intersection_skip_self_local(cray->self, current_isect.prim)) {
+        *args->valid = 0;
+        return;
+      }
+
+      /* No intersection information requested, just return a hit. */
+      if (ctx->max_hits == 0) {
+        break;
+      }
+
+      /* Ignore curves. */
+      if (EMBREE_IS_HAIR(hit->geomID)) {
+        /* This tells Embree to continue tracing. */
+        *args->valid = 0;
+        break;
+      }
+
+      LocalIntersection *local_isect = ctx->local_isect;
+      int hit_idx = 0;
+
+      /* With an LCG state multiple hits are collected; without one only the closest. */
+      if (ctx->lcg_state) {
+        /* See triangle_intersect_subsurface() for the native equivalent. */
+        /* Reject a hit whose distance exactly matches one that was already recorded. */
+        for (int i = min((int)ctx->max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+          if (local_isect->hits[i].t == ray->tfar) {
+            /* This tells Embree to continue tracing. */
+            *args->valid = 0;
+            return;
+          }
+        }
+
+        /* num_hits counts every candidate, including those not stored below. */
+        local_isect->num_hits++;
+
+        if (local_isect->num_hits <= ctx->max_hits) {
+          hit_idx = local_isect->num_hits - 1;
+        }
+        else {
+          /* reservoir sampling: if we are at the maximum number of
+           * hits, randomly replace element or skip it */
+          hit_idx = lcg_step_uint(ctx->lcg_state) % local_isect->num_hits;
+
+          if (hit_idx >= ctx->max_hits) {
+            /* This tells Embree to continue tracing. */
+            *args->valid = 0;
+            return;
+          }
+        }
+      }
+      else {
+        /* Record closest intersection only. */
+        if (local_isect->num_hits && current_isect.t > local_isect->hits[0].t) {
+          *args->valid = 0;
+          return;
+        }
+
+        local_isect->num_hits = 1;
+      }
+
+      /* record intersection */
+      local_isect->hits[hit_idx] = current_isect;
+      local_isect->Ng[hit_idx] = normalize(make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z));
+      /* This tells Embree to continue tracing. */
+      *args->valid = 0;
+      break;
+    }
+    /* Volume rays: gather hits on objects that have a volume. */
+    case CCLIntersectContext::RAY_VOLUME_ALL: {
+      /* Append the intersection to the end of the array. */
+      /* Once max_hits entries are stored, `valid` is left untouched and the hit is
+       * accepted. */
+      if (ctx->num_hits < ctx->max_hits) {
+        Intersection current_isect;
+        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
+        if (intersection_skip_self(cray->self, current_isect.object, current_isect.prim)) {
+          *args->valid = 0;
+          return;
+        }
+
+        Intersection *isect = &ctx->isect_s[ctx->num_hits];
+        ++ctx->num_hits;
+        *isect = current_isect;
+        /* Only primitives from volume object. */
+        /* The hit was stored above; rolling the counter back discards it again, so
+         * the next stored hit reuses the same slot. */
+        uint tri_object = isect->object;
+        int object_flag = kernel_data_fetch(object_flag, tri_object);
+        if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
+          --ctx->num_hits;
+        }
+        /* This tells Embree to continue tracing. */
+        *args->valid = 0;
+      }
+      break;
+    }
+    /* Regular rays (and unknown types): only self intersections are rejected. */
+    case CCLIntersectContext::RAY_REGULAR:
+    default:
+      if (kernel_embree_is_self_intersection(kg, hit, cray)) {
+        *args->valid = 0;
+        return;
+      }
+      break;
+  }
+}
+
+/* Intersection filter that rejects back-facing hits in addition to self
+ * intersections. A hit counts as back-facing when the ray direction and the
+ * geometric normal have a positive dot product. */
+ccl_device void kernel_embree_filter_func_backface_cull(const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  const RTCHit *hit = (RTCHit *)args->hit;
+
+  const float3 ray_dir = make_float3(ray->dir_x, ray->dir_y, ray->dir_z);
+  const float3 geom_normal = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+  const bool is_backfacing = dot(ray_dir, geom_normal) > 0.0f;
+
+  if (!is_backfacing) {
+    CCLIntersectContext *ctx = ((IntersectContext *)args->context)->userRayExt;
+    if (!kernel_embree_is_self_intersection(ctx->kg, hit, ctx->ray)) {
+      /* Front-facing hit that is not a self intersection: keep it. */
+      return;
+    }
+  }
+
+  /* Back-facing or self intersection: reject the hit. */
+  *args->valid = 0;
+}
+
+/* Occlusion filter that rejects back-facing hits and forwards all other hits to
+ * kernel_embree_filter_occluded_func(). */
+ccl_device void kernel_embree_filter_occluded_func_backface_cull(
+    const RTCFilterFunctionNArguments *args)
+{
+  const RTCRay *ray = (RTCRay *)args->ray;
+  const RTCHit *hit = (RTCHit *)args->hit;
+
+  const float3 ray_dir = make_float3(ray->dir_x, ray->dir_y, ray->dir_z);
+  const float3 geom_normal = make_float3(hit->Ng_x, hit->Ng_y, hit->Ng_z);
+
+  if (dot(ray_dir, geom_normal) > 0.0f) {
+    /* Always ignore back-facing intersections. */
+    *args->valid = 0;
+  }
+  else {
+    kernel_embree_filter_occluded_func(args);
+  }
+}
+
+/* Scene intersection. */
+
+/* Trace a regular ray through the Embree scene.
+ * `isect->t` is first set to the ray's tmax. Returns false when Embree reports no
+ * geometry or primitive for the hit; otherwise converts the hit into `isect` and
+ * returns true. */
+ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
+                                                  ccl_private const Ray *ray,
+                                                  const uint visibility,
+                                                  ccl_private Intersection *isect)
+{
+  isect->t = ray->tmax;
+
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
+  ctx.ray = ray;
+  IntersectContext rtc_ctx(&ctx);
+
+  RTCRayHit ray_hit;
+  kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
+  rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit);
+
+  const bool found_hit = (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID) &&
+                         (ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID);
+  if (found_hit) {
+    kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
+  }
+  return found_hit;
+}
+
+#ifdef __BVH_LOCAL__
+/* Trace a ray against a single object and let the occlusion filter record hits
+ * into `local_isect` (which may be null).
+ *
+ * Objects without SD_OBJECT_TRANSFORM_APPLIED are traced as RAY_SSS against the
+ * scene stored on their Embree geometry, with the ray moved into object space.
+ * All other objects are traced as RAY_LOCAL against the top-level scene.
+ *
+ * Returns true when at least one hit was recorded, or when Embree reported
+ * occlusion through a negative tfar. */
+ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
+                                                        ccl_private const Ray *ray,
+                                                        ccl_private LocalIntersection *local_isect,
+                                                        int local_object,
+                                                        ccl_private uint *lcg_state,
+                                                        int max_hits)
+{
+  const bool transform_applied = (kernel_data_fetch(object_flag, local_object) &
+                                  SD_OBJECT_TRANSFORM_APPLIED) != 0;
+  const bool has_bvh = !transform_applied;
+
+  CCLIntersectContext ctx(kg,
+                          has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
+  ctx.ray = ray;
+  ctx.local_object_id = local_object;
+  ctx.local_isect = local_isect;
+  ctx.lcg_state = lcg_state;
+  ctx.max_hits = max_hits;
+  if (local_isect) {
+    local_isect->num_hits = 0;
+  }
+
+  IntersectContext rtc_ctx(&ctx);
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, PATH_RAY_ALL_VISIBILITY);
+
+  if (!has_bvh) {
+    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+  }
+  else {
+    /* If this object has its own BVH, use it. */
+    RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2);
+    if (geom) {
+      /* Let the instance push update origin and direction for this object. */
+      float3 P = ray->P;
+      float3 dir = ray->D;
+      float3 idir = ray->D;
+      bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir);
+
+      rtc_ray.org_x = P.x;
+      rtc_ray.org_y = P.y;
+      rtc_ray.org_z = P.z;
+      rtc_ray.dir_x = dir.x;
+      rtc_ray.dir_y = dir.y;
+      rtc_ray.dir_z = dir.z;
+      rtc_ray.tnear = ray->tmin;
+      rtc_ray.tfar = ray->tmax;
+
+      RTCScene scene = (RTCScene)rtcGetGeometryUserData(geom);
+      kernel_assert(scene);
+      if (scene) {
+        rtcOccluded1(scene, &rtc_ctx.context, &rtc_ray);
+      }
+    }
+  }
+
+  /* rtcOccluded1 sets tfar to -inf if a hit was found. */
+  const bool recorded_hit = local_isect && local_isect->num_hits > 0;
+  return recorded_hit || (rtc_ray.tfar < 0);
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+/* Trace a shadow ray and let the occlusion filter record transparent hits into
+ * the shadow state's intersection array.
+ * Writes the number of recorded hits and the accumulated throughput to the output
+ * pointers, and returns whether an opaque hit was found. */
+ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
+                                                             IntegratorShadowStateCPU *state,
+                                                             ccl_private const Ray *ray,
+                                                             uint visibility,
+                                                             uint max_hits,
+                                                             ccl_private uint *num_recorded_hits,
+                                                             ccl_private float *throughput)
+{
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
+  ctx.ray = ray;
+  ctx.max_hits = max_hits;
+  ctx.isect_s = (Intersection *)state->shadow_isect;
+  IntersectContext rtc_ctx(&ctx);
+
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+  rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+
+  /* Report what the filter callback accumulated in the context. */
+  *throughput = ctx.throughput;
+  *num_recorded_hits = ctx.num_recorded_hits;
+  return ctx.opaque_hit;
+}
+#endif
+
+#ifdef __VOLUME__
+/* Trace a ray and let the occlusion filter store up to `max_hits` hits on volume
+ * objects into the `isect` array. Returns the number of hits stored. */
+ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
+                                                         ccl_private const Ray *ray,
+                                                         ccl_private Intersection *isect,
+                                                         const uint max_hits,
+                                                         const uint visibility)
+{
+  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
+  ctx.ray = ray;
+  ctx.isect_s = isect;
+  ctx.num_hits = 0;
+  ctx.max_hits = max_hits;
+  IntersectContext rtc_ctx(&ctx);
+
+  RTCRay rtc_ray;
+  kernel_embree_setup_ray(*ray, rtc_ray, visibility);
+  rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
+
+  return ctx.num_hits;
+}
+#endif
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/cpu/compat.h
+++ b/intern/cycles/kernel/device/cpu/compat.h
@@ -3,8 +3,6 @@

 #pragma once

-#define __KERNEL_CPU__
-
 /* Release kernel has too much false-positive maybe-uninitialized warnings,
 * which makes it possible to miss actual warnings.
 */
@@ -35,38 +33,4 @@ CCL_NAMESPACE_BEGIN

 #define kernel_assert(cond) assert(cond)

-/* Macros to handle different memory storage on different devices */
-
-#ifdef __KERNEL_SSE2__
-typedef vector3<sseb> sse3b;
-typedef vector3<ssef> sse3f;
-typedef vector3<ssei> sse3i;
-
-ccl_device_inline void print_sse3b(const char *label, sse3b &a)
-{
-  print_sseb(label, a.x);
-  print_sseb(label, a.y);
-  print_sseb(label, a.z);
-}
-
-ccl_device_inline void print_sse3f(const char *label, sse3f &a)
-{
-  print_ssef(label, a.x);
-  print_ssef(label, a.y);
-  print_ssef(label, a.z);
-}
-
-ccl_device_inline void print_sse3i(const char *label, sse3i &a)
-{
-  print_ssei(label, a.x);
-  print_ssei(label, a.y);
-  print_ssei(label, a.z);
-}
-
-#  if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
-typedef vector3<avxf> avx3f;
-#  endif
-
-#endif
-
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/metal/bvh.h
+++ b/intern/cycles/kernel/device/metal/bvh.h
@@ -0,0 +1,360 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* MetalRT implementation of ray-scene intersection. */
+
+#pragma once
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Payload types. */
+
+/* Payload handed to the MetalRT intersector by scene_intersect(). */
+struct MetalRTIntersectionPayload {
+  /* Copied from ray->self by the caller. */
+  RaySelfPrimitives self;
+  /* Visibility flags of the query, as passed to scene_intersect(). */
+  uint visibility;
+  /* Initialized to zero by the caller and read back for hits that are not
+   * triangles. */
+  float u, v;
+  /* Read back into the resulting Intersection. Not written by scene_intersect()
+   * itself; presumably filled in by the intersection functions (TODO confirm). */
+  int prim;
+  int type;
+#if defined(__METALRT_MOTION__)
+  /* Copied from ray->time by the caller. */
+  float time;
+#endif
+};
+
+/* Payload handed to the MetalRT intersector by scene_intersect_local(). */
+struct MetalRTIntersectionLocalPayload {
+  /* Copied from ray->self by the caller. */
+  RaySelfPrimitives self;
+  /* Object the local query is restricted to. */
+  uint local_object;
+  /* Copy of the caller's LCG state; written back to the caller after tracing. */
+  uint lcg_state;
+  /* Set from the caller's `int max_hits`; note the narrower type. */
+  short max_hits;
+  /* Whether `lcg_state` holds a value supplied by the caller. */
+  bool has_lcg_state;
+  /* Return value of scene_intersect_local(); initialized to false by the caller. */
+  bool result;
+  /* Hits collected during tracing, copied out to the caller afterwards. */
+  LocalIntersection local_isect;
+};
+
+/* Payload handed to the MetalRT intersector by scene_intersect_shadow_all(). */
+struct MetalRTIntersectionShadowPayload {
+  /* Copied from ray->self by the caller. */
+  RaySelfPrimitives self;
+  /* Visibility flags of the query. */
+  uint visibility;
+#if defined(__METALRT_MOTION__)
+  /* Copied from ray->time by the caller. */
+  float time;
+#endif
+  /* Set from the caller's IntegratorShadowState. */
+  int state;
+  /* Initialized to 1.0f by the caller. */
+  float throughput;
+  /* Set from the caller's `uint max_hits`; note the narrower type. */
+  short max_hits;
+  /* Both counters are initialized to zero by the caller. */
+  short num_hits;
+  short num_recorded_hits;
+  /* Initialized to false by the caller. */
+  bool result;
+};
+
+/* Scene intersection. */
+
+/* Intersect a ray with the scene using MetalRT.
+ *
+ * Whenever false is returned because the ray is invalid, a debug check fails or
+ * MetalRT reports no intersection, `isect->t` holds the ray's tmax and
+ * `isect->type` is PRIMITIVE_NONE. Otherwise `isect` is filled from the MetalRT
+ * result and the payload, and the return value tells whether the resulting type
+ * differs from PRIMITIVE_NONE. */
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  /* Start from the "no hit" result; it is overwritten below when something is hit. */
+  isect->t = ray->tmax;
+  isect->type = PRIMITIVE_NONE;
+
+  if (!intersection_ray_valid(ray)) {
+    return false;
+  }
+
+#if defined(__KERNEL_DEBUG__)
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
+    kernel_assert(!"Invalid ift_default");
+    return false;
+  }
+#endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  /* Without curves in the scene, every geometry can be assumed to be triangles. */
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
+  payload.u = 0.0f;
+  payload.v = 0.0f;
+  payload.visibility = visibility;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+  /* The low byte of the visibility is used as the ray mask. When it is empty but
+   * higher bits are set, the full mask is used instead.
+   * No further intersector setup is required: default MetalRT behavior is any-hit,
+   * and shadow ray early termination is controlled by the intersection handler. */
+  uint ray_mask = visibility & 0xFF;
+  if (ray_mask == 0 && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+#if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_default,
+                                             payload);
+#else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_default, payload);
+#endif
+
+  if (intersection.type == intersection_type::none) {
+    return false;
+  }
+
+  isect->t = intersection.distance;
+  isect->object = intersection.user_instance_id;
+  isect->prim = payload.prim;
+  isect->type = payload.type;
+
+  /* Triangle hits take u/v from the barycentric coordinates reported by MetalRT;
+   * all other hits take them from the payload. */
+  const bool is_triangle = (intersection.type == intersection_type::triangle);
+  isect->u = is_triangle ? intersection.triangle_barycentric_coord.x : payload.u;
+  isect->v = is_triangle ? intersection.triangle_barycentric_coord.y : payload.v;
+
+  return isect->type != PRIMITIVE_NONE;
+}
+
+#ifdef __BVH_LOCAL__
+/* Intersect a ray with a single object using MetalRT, collecting local hits.
+ *
+ * kg: kernel globals.
+ * ray: ray to trace. An invalid ray yields zero hits and a false return value.
+ * local_isect: optional output for the collected hits; may be null.
+ * local_object: object the query is restricted to.
+ * lcg_state: optional random number state; when given it is copied into the
+ *            payload and the updated value is written back after tracing.
+ * max_hits: maximum number of hits, stored in the payload as a short.
+ *
+ * Returns the `result` flag of the payload after tracing. */
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  if (!intersection_ray_valid(ray)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__)
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_local)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid ift_local");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionLocalPayload payload;
+  payload.self = ray->self;
+  payload.local_object = local_object;
+  payload.max_hits = max_hits;
+  payload.local_isect.num_hits = 0;
+  /* The payload is not value-initialized, so the flag must be given a defined
+   * value on both paths, not only when an LCG state is supplied. */
+  payload.has_lcg_state = false;
+  if (lcg_state) {
+    payload.has_lcg_state = true;
+    payload.lcg_state = *lcg_state;
+  }
+  payload.result = false;
+
+  typename metalrt_intersector_type::result_type intersection;
+
+#  if defined(__METALRT_MOTION__)
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
+#  endif
+
+  if (lcg_state) {
+    *lcg_state = payload.lcg_state;
+  }
+  /* local_isect is optional, as the early-return paths above already assume, so
+   * only copy the collected hits out when the caller asked for them. */
+  if (local_isect) {
+    *local_isect = payload.local_isect;
+  }
+
+  return payload.result;
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+/* Trace a shadow ray through the MetalRT acceleration structure using the shadow
+ * intersection function table (`ift_shadow`).
+ *
+ * The per-ray payload starts with zero hits and a throughput of 1.0; after the trace its
+ * `num_recorded_hits` and `throughput` are copied to the output pointers.
+ *
+ * `num_recorded_hits` and `throughput` are always written, also when the function returns
+ * early, so callers never read indeterminate values after a `false` return.
+ *
+ * Returns `payload.result` as left by the intersection functions, or false when the ray is
+ * rejected before tracing. */
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  /* Give the outputs the same neutral values the payload starts from, so every early-out
+   * below leaves them well defined. */
+  *num_recorded_hits = 0;
+  *throughput = 1.0f;
+
+  if (!intersection_ray_valid(ray)) {
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__)
+  /* Debug builds only: assert and bail out when ray-tracing resources are missing. */
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_shadow)) {
+    kernel_assert(!"Invalid ift_shadow");
+    return false;
+  }
+#  endif
+
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
+  metalrt_intersector_type metalrt_intersect;
+
+  /* Force every hit to be treated as non-opaque.
+   * NOTE(review): presumably so that the any-hit functions run for each candidate - confirm. */
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  MetalRTIntersectionShadowPayload payload;
+  payload.self = ray->self;
+  payload.visibility = visibility;
+  payload.max_hits = max_hits;
+  payload.num_hits = 0;
+  payload.num_recorded_hits = 0;
+  payload.throughput = 1.0f;
+  payload.result = false;
+  payload.state = state;
+
+  /* Use the low 8 visibility bits as the ray mask. If they are all clear while higher bits
+   * are set, fall back to a full mask instead of a zero mask. */
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  /* The returned intersection is not inspected here; results are read from the payload. */
+  typename metalrt_intersector_type::result_type intersection;
+
+#  if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+  intersection = metalrt_intersect.intersect(r,
+                                             metal_ancillaries->accel_struct,
+                                             ray_mask,
+                                             ray->time,
+                                             metal_ancillaries->ift_shadow,
+                                             payload);
+#  else
+  intersection = metalrt_intersect.intersect(
+      r, metal_ancillaries->accel_struct, ray_mask, metal_ancillaries->ift_shadow, payload);
+#  endif
+
+  *num_recorded_hits = payload.num_recorded_hits;
+  *throughput = payload.throughput;
+
+  return payload.result;
+}
+#endif
+
+#ifdef __VOLUME__
+/* Intersect a ray with the scene through MetalRT using the default intersection function
+ * table (`ift_default`) and store the accepted hit in `isect`.
+ *
+ * Returns false when the ray is invalid, when nothing was hit, or when the recorded primitive
+ * type is PRIMITIVE_NONE. */
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint visibility)
+{
+  if (!intersection_ray_valid(ray)) {
+    return false;
+  }
+
+#  if defined(__KERNEL_DEBUG__)
+  /* Debug builds only: assert and bail out when ray-tracing resources are missing. */
+  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
+    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
+    return false;
+  }
+
+  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
+    kernel_assert(!"Invalid ift_default");
+    return false;
+  }
+#  endif
+
+  /* The low 8 visibility bits form the mask. If they are all clear while higher bits are
+   * set, a full mask is used instead. */
+  const uint visibility_low_bits = visibility & 0xFF;
+  const bool only_high_bits_set = (0 == visibility_low_bits) && ((visibility & ~0xFF) != 0);
+  const uint instance_mask = only_high_bits_set ? 0xFF : visibility_low_bits;
+
+  /* Per-ray data handed to the intersection functions. */
+  MetalRTIntersectionPayload payload;
+  payload.self = ray->self;
+  payload.visibility = visibility;
+#  if defined(__METALRT_MOTION__)
+  payload.time = ray->time;
+#  endif
+
+  /* Configure the intersector. */
+  metalrt_intersector_type intersector;
+  intersector.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  if (!kernel_data.bvh.have_curves) {
+    intersector.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  metal::raytracing::ray mtl_ray(ray->P, ray->D, ray->tmin, ray->tmax);
+  typename metalrt_intersector_type::result_type hit;
+
+#  if defined(__METALRT_MOTION__)
+  hit = intersector.intersect(mtl_ray,
+                              metal_ancillaries->accel_struct,
+                              instance_mask,
+                              ray->time,
+                              metal_ancillaries->ift_default,
+                              payload);
+#  else
+  hit = intersector.intersect(
+      mtl_ray, metal_ancillaries->accel_struct, instance_mask, metal_ancillaries->ift_default, payload);
+#  endif
+
+  if (hit.type == intersection_type::none) {
+    return false;
+  }
+
+  /* Triangle hits take their barycentrics from the intersection result; any other hit type
+   * takes the coordinates stored in the payload. */
+  const bool is_triangle_hit = (hit.type == intersection_type::triangle);
+  isect->t = hit.distance;
+  isect->u = is_triangle_hit ? hit.triangle_barycentric_coord.x : payload.u;
+  isect->v = is_triangle_hit ? hit.triangle_barycentric_coord.y : payload.v;
+
+  isect->object = hit.user_instance_id;
+  isect->prim = payload.prim;
+  isect->type = payload.type;
+
+  return isect->type != PRIMITIVE_NONE;
+}
+#endif
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -260,8 +260,6 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \

 #ifdef __METALRT__

-#  define __KERNEL_GPU_RAYTRACING__
-
 #  if defined(__METALRT_MOTION__)
 #    define METALRT_TAGS instancing, instance_motion, primitive_motion
 #  else
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -1,41 +1,44 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2021-2022 Blender Foundation */

-/* Metal kernel entry points */
+/* Metal kernel entry points. */

 #include "kernel/device/metal/compat.h"
 #include "kernel/device/metal/globals.h"
 #include "kernel/device/metal/function_constants.h"
 #include "kernel/device/gpu/kernel.h"

-/* MetalRT intersection handlers */
+/* MetalRT intersection handlers. */
+
 #ifdef __METALRT__

-/* Return type for a bounding box intersection function. */
-struct BoundingBoxIntersectionResult
-{
+/* Intersection return types. */
+
+/* For a bounding box intersection function. */
+struct BoundingBoxIntersectionResult {
  bool accept [[accept_intersection]];
  bool continue_search [[continue_search]];
  float distance [[distance]];
 };

-/* Return type for a triangle intersection function. */
-struct TriangleIntersectionResult
-{
+/* For a triangle intersection function. */
+struct TriangleIntersectionResult {
  bool accept [[accept_intersection]];
-  bool continue_search  [[continue_search]];
+  bool continue_search [[continue_search]];
 };

 enum { METALRT_HIT_TRIANGLE, METALRT_HIT_BOUNDING_BOX };

-ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives& self,
+/* Utilities. */
+
+ccl_device_inline bool intersection_skip_self(ray_data const RaySelfPrimitives &self,
                                              const int object,
                                              const int prim)
 {
  return (self.prim == prim) && (self.object == object);
 }

-ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives& self,
+ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimitives &self,
                                                     const int object,
                                                     const int prim)
 {
@@ -43,12 +46,14 @@ ccl_device_inline bool intersection_skip_self_shadow(ray_data const RaySelfPrimi
         ((self.light_prim == prim) && (self.light_object == object));
 }

-ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives& self,
+ccl_device_inline bool intersection_skip_self_local(ray_data const RaySelfPrimitives &self,
                                                    const int prim)
 {
  return (self.prim == prim);
 }

+/* Hit functions. */
+
 template<typename TReturn, uint intersection_type>
 TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                          ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload,
@@ -58,7 +63,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
                          const float ray_tmax)
 {
  TReturn result;
-  
+
 #ifdef __BVH_LOCAL__
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);

@@ -101,7 +106,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
  }
  else {
    if (payload.local_isect.num_hits && ray_tmax > payload.local_isect.hits[0].t) {
-      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee about distance ordering in any-hit */
+      /* Record closest intersection only. Do not terminate ray here, since there is no guarantee
+       * about distance ordering in any-hit. */
      result.accept = false;
      result.continue_search = true;
      return result;
@@ -116,8 +122,8 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
  isect->object = object;
  isect->type = kernel_data_fetch(objects, object).primitive_type;

-  isect->u = 1.0f - barycentrics.y - barycentrics.x;
-  isect->v = barycentrics.x;
+  isect->u = barycentrics.x;
+  isect->v = barycentrics.y;

  /* Record geometric normal */
  const uint tri_vindex = kernel_data_fetch(tri_vindex, isect->prim).w;
@@ -133,21 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #endif
 }

-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                       ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
-                                       uint instance_id [[user_instance_id]],
-                                       uint primitive_id [[primitive_id]],
-                                       float2 barycentrics [[barycentric_coord]],
-                                       float ray_tmax [[distance]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+    uint instance_id [[user_instance_id]],
+    uint primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
 {
  return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
+      launch_params_metal, payload, instance_id, primitive_id, barycentrics, ray_tmax);
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
 {
  /* unused function */
@@ -180,18 +185,14 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
    return true;
  }

-  float u = 0.0f, v = 0.0f;
+  const float u = barycentrics.x;
+  const float v = barycentrics.y;
  int type = 0;
  if (intersection_type == METALRT_HIT_TRIANGLE) {
-    u = 1.0f - barycentrics.y - barycentrics.x;
-    v = barycentrics.x;
    type = kernel_data_fetch(objects, object).primitive_type;
  }
 #  ifdef __HAIR__
  else {
-    u = barycentrics.x;
-    v = barycentrics.y;
-    
    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
    type = segment.type;
    prim = segment.prim;
@@ -215,7 +216,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
  short num_recorded_hits = payload.num_recorded_hits;

  MetalKernelContext context(launch_params_metal);
-  
+
  /* If no transparent shadows, all light is blocked and we can stop immediately. */
  if (num_hits >= max_hits ||
      !(context.intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
@@ -223,7 +224,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
    /* terminate ray */
    return false;
  }
-  
+
  /* Always use baked shadow transparency for curves. */
  if (type & PRIMITIVE_CURVE) {
    float throughput = payload.throughput;
@@ -240,10 +241,10 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
      return true;
    }
  }
-  
+
  payload.num_hits += 1;
  payload.num_recorded_hits += 1;
-  
+
  uint record_index = num_recorded_hits;

  const IntegratorShadowState state = payload.state;
@@ -278,7 +279,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
-  
+
  /* Continue tracing. */
 #  endif /* __TRANSPARENT_SHADOWS__ */
 #endif   /* __SHADOW_RECORD_ALL__ */
@@ -286,26 +287,25 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
  return true;
 }

-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                            ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                            unsigned int object [[user_instance_id]],
-                                            unsigned int primitive_id [[primitive_id]],
-                                            float2 barycentrics [[barycentric_coord]],
-                                            float ray_tmax [[distance]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_shadow_all_hit_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    unsigned int object [[user_instance_id]],
+    unsigned int primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);

  TriangleIntersectionResult result;
  result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
+      launch_params_metal, payload, object, prim, barycentrics, ray_tmax);
  result.accept = !result.continue_search;
  return result;
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
 {
  /* unused function */
@@ -317,15 +317,16 @@ __anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]
 }

 template<typename TReturnType, uint intersection_type>
-inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_params_metal,
-                                           ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                           const uint object,
-                                           const uint prim,
-                                           const float u)
+inline TReturnType metalrt_visibility_test(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const float u)
 {
  TReturnType result;
-    
-#  ifdef __HAIR__
+
+#ifdef __HAIR__
  if (intersection_type == METALRT_HIT_BOUNDING_BOX) {
    /* Filter out curve endcaps. */
    if (u == 0.0f || u == 1.0f) {
@@ -334,16 +335,16 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
      return result;
    }
  }
-#  endif
+#endif

  uint visibility = payload.visibility;
-#  ifdef __VISIBILITY_FLAG__
+#ifdef __VISIBILITY_FLAG__
  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
    result.accept = false;
    result.continue_search = true;
    return result;
  }
-#  endif
+#endif

  /* Shadow ray early termination. */
  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
@@ -371,16 +372,17 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
  return result;
 }

-[[intersection(triangle, triangle_data, METALRT_TAGS)]]
-TriangleIntersectionResult
-__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
-                                             unsigned int object [[user_instance_id]],
-                                             unsigned int primitive_id [[primitive_id]])
+[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_visibility_test_tri(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+    unsigned int object [[user_instance_id]],
+    unsigned int primitive_id [[primitive_id]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
-  TriangleIntersectionResult result = metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
-            launch_params_metal, payload, object, prim, 0.0f);
+  TriangleIntersectionResult result =
+      metalrt_visibility_test<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+          launch_params_metal, payload, object, prim, 0.0f);
  if (result.accept) {
    payload.prim = prim;
    payload.type = kernel_data_fetch(objects, object).primitive_type;
@@ -388,8 +390,7 @@ __anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_
  return result;
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
 {
  /* Unused function */
@@ -400,19 +401,21 @@ __anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance
  return result;
 }

+/* Primitive intersection functions. */
+
 #ifdef __HAIR__
-ccl_device_inline
-void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
-                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                const uint object,
-                                const uint prim,
-                                const uint type,
-                                const float3 ray_origin,
-                                const float3 ray_direction,
-                                float time,
-                                const float ray_tmin,
-                                const float ray_tmax,
-                                thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_curve(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
 #  ifdef __VISIBILITY_FLAG__
  const uint visibility = payload.visibility;
@@ -421,25 +424,16 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
  }
 #  endif

-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
  Intersection isect;
  isect.t = ray_tmax;
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
    result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
-                  launch_params_metal, payload, object, prim, isect.u);
+        launch_params_metal, payload, object, prim, isect.u);
    if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
      payload.u = isect.u;
      payload.v = isect.v;
      payload.prim = prim;
@@ -448,54 +442,41 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
  }
 }

-ccl_device_inline
-void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params_metal,
-                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
-                                       const uint object,
-                                       const uint prim,
-                                       const uint type,
-                                       const float3 ray_origin,
-                                       const float3 ray_direction,
-                                       float time,
-                                       const float ray_tmin,
-                                       const float ray_tmax,
-                                       thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_curve_shadow(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
  const uint visibility = payload.visibility;

-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
  Intersection isect;
  isect.t = ray_tmax;
-  /* Transform maximum distance into object space */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
    result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
-                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+        launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
    result.accept = !result.continue_search;
-
-    if (result.accept) {
-      result.distance = isect.t / len;
-    }
  }
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload
+                             [[payload]],
                             const uint object [[user_instance_id]],
                             const uint primitive_id [[primitive_id]],
-                             const float3 ray_origin [[origin]],
-                             const float3 ray_direction [[direction]],
+                             const float3 ray_P [[origin]],
+                             const float3 ray_D [[direction]],
                             const float ray_tmin [[min_distance]],
                             const float ray_tmax [[max_distance]])
 {
@@ -508,28 +489,36 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
  result.distance = ray_tmax;

  if (segment.type & PRIMITIVE_CURVE_RIBBON) {
-    metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+    metalrt_intersection_curve(launch_params_metal,
+                               payload,
+                               object,
+                               segment.prim,
+                               segment.type,
+                               ray_P,
+                               ray_D,
 #  if defined(__METALRT_MOTION__)
                               payload.time,
 #  else
                               0.0f,
 #  endif
-                               ray_tmin, ray_tmax, result);
+                               ray_tmin,
+                               ray_tmax,
+                               result);
  }

  return result;
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                    const uint object [[user_instance_id]],
-                                    const uint primitive_id [[primitive_id]],
-                                    const float3 ray_origin [[origin]],
-                                    const float3 ray_direction [[direction]],
-                                    const float ray_tmin [[min_distance]],
-                                    const float ray_tmax [[max_distance]])
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__curve_ribbon_shadow(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    const uint object [[user_instance_id]],
+    const uint primitive_id [[primitive_id]],
+    const float3 ray_P [[origin]],
+    const float3 ray_D [[direction]],
+    const float ray_tmin [[min_distance]],
+    const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
@@ -540,57 +529,73 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
  result.distance = ray_tmax;

  if (segment.type & PRIMITIVE_CURVE_RIBBON) {
-    metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+    metalrt_intersection_curve_shadow(launch_params_metal,
+                                      payload,
+                                      object,
+                                      segment.prim,
+                                      segment.type,
+                                      ray_P,
+                                      ray_D,
 #  if defined(__METALRT_MOTION__)
-                               payload.time,
+                                      payload.time,
 #  else
-                               0.0f,
+                                      0.0f,
 #  endif
-                               ray_tmin, ray_tmax, result);
+                                      ray_tmin,
+                                      ray_tmax,
+                                      result);
  }

  return result;
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                          ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                          ray_data MetalKernelContext::MetalRTIntersectionPayload &payload
+                          [[payload]],
                          const uint object [[user_instance_id]],
                          const uint primitive_id [[primitive_id]],
-                          const float3 ray_origin [[origin]],
-                          const float3 ray_direction [[direction]],
+                          const float3 ray_P [[origin]],
+                          const float3 ray_D [[direction]],
                          const float ray_tmin [[min_distance]],
                          const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
-    
+
  BoundingBoxIntersectionResult result;
  result.accept = false;
  result.continue_search = true;
  result.distance = ray_tmax;
-  metalrt_intersection_curve(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+  metalrt_intersection_curve(launch_params_metal,
+                             payload,
+                             object,
+                             segment.prim,
+                             segment.type,
+                             ray_P,
+                             ray_D,
 #  if defined(__METALRT_MOTION__)
                             payload.time,
 #  else
                             0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
+                             ray_tmin,
+                             ray_tmax,
+                             result);

  return result;
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                 ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                 const uint object [[user_instance_id]],
-                                 const uint primitive_id [[primitive_id]],
-                                 const float3 ray_origin [[origin]],
-                                 const float3 ray_direction [[direction]],
-                                 const float ray_tmin [[min_distance]],
-                                 const float ray_tmax [[max_distance]])
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__curve_all_shadow(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
+    const uint object [[user_instance_id]],
+    const uint primitive_id [[primitive_id]],
+    const float3 ray_P [[origin]],
+    const float3 ray_D [[direction]],
+    const float ray_tmin [[min_distance]],
+    const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
@@ -600,31 +605,39 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal
  result.continue_search = true;
  result.distance = ray_tmax;

-  metalrt_intersection_curve_shadow(launch_params_metal, payload, object, segment.prim, segment.type, ray_origin, ray_direction,
+  metalrt_intersection_curve_shadow(launch_params_metal,
+                                    payload,
+                                    object,
+                                    segment.prim,
+                                    segment.type,
+                                    ray_P,
+                                    ray_D,
 #  if defined(__METALRT_MOTION__)
-                             payload.time,
+                                    payload.time,
 #  else
-                             0.0f,
+                                    0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
+                                    ray_tmin,
+                                    ray_tmax,
+                                    result);

  return result;
 }
 #endif /* __HAIR__ */

 #ifdef __POINTCLOUD__
-ccl_device_inline
-void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
-                                ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
-                                const uint object,
-                                const uint prim,
-                                const uint type,
-                                const float3 ray_origin,
-                                const float3 ray_direction,
-                                float time,
-                                const float ray_tmin,
-                                const float ray_tmax,
-                                thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_point(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
 #  ifdef __VISIBILITY_FLAG__
  const uint visibility = payload.visibility;
@@ -633,25 +646,16 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
  }
 #  endif

-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the point intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
  Intersection isect;
  isect.t = ray_tmax;
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
    result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
-                  launch_params_metal, payload, object, prim, isect.u);
+        launch_params_metal, payload, object, prim, isect.u);
    if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
      payload.u = isect.u;
      payload.v = isect.v;
      payload.prim = prim;
@@ -660,50 +664,78 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
  }
 }

-ccl_device_inline
-void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params_metal,
-                                       ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
-                                       const uint object,
-                                       const uint prim,
-                                       const uint type,
-                                       const float3 ray_origin,
-                                       const float3 ray_direction,
-                                       float time,
-                                       const float ray_tmin,
-                                       const float ray_tmax,
-                                       thread BoundingBoxIntersectionResult &result)
+ccl_device_inline void metalrt_intersection_point_shadow(
+    constant KernelParamsMetal &launch_params_metal,
+    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
+    const uint object,
+    const uint prim,
+    const uint type,
+    const float3 ray_P,
+    const float3 ray_D,
+    float time,
+    const float ray_tmin,
+    const float ray_tmax,
+    thread BoundingBoxIntersectionResult &result)
 {
  const uint visibility = payload.visibility;

-  float3 P = ray_origin;
-  float3 dir = ray_direction;
-
-  /* The direction is not normalized by default, but the point intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
  Intersection isect;
  isect.t = ray_tmax;
-  /* Transform maximum distance into object space */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(
+          NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
    result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
-                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
+        launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
    result.accept = !result.continue_search;

    if (result.accept) {
-      result.distance = isect.t / len;
+      result.distance = isect.t;
    }
  }
 }

-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
 __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                             ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                      ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
+                      const uint object [[user_instance_id]],
+                      const uint primitive_id [[primitive_id]],
+                      const float3 ray_origin [[origin]],
+                      const float3 ray_direction [[direction]],
+                      const float ray_tmin [[min_distance]],
+                      const float ray_tmax [[max_distance]])
+{
+  const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
+  const int type = kernel_data_fetch(objects, object).primitive_type;
+
+  BoundingBoxIntersectionResult result;
+  result.accept = false;
+  result.continue_search = true;
+  result.distance = ray_tmax;
+
+  metalrt_intersection_point(launch_params_metal,
+                             payload,
+                             object,
+                             prim,
+                             type,
+                             ray_origin,
+                             ray_direction,
+#  if defined(__METALRT_MOTION__)
+                             payload.time,
+#  else
+                             0.0f,
+#  endif
+                             ray_tmin,
+                             ray_tmax,
+                             result);
+
+  return result;
+}
+
+[[intersection(bounding_box, triangle_data, METALRT_TAGS)]] BoundingBoxIntersectionResult
+__intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+                             ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload
+                             [[payload]],
                             const uint object [[user_instance_id]],
                             const uint primitive_id [[primitive_id]],
                             const float3 ray_origin [[origin]],
@@ -719,43 +751,21 @@ __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1
  result.continue_search = true;
  result.distance = ray_tmax;

-  metalrt_intersection_point(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
+  metalrt_intersection_point_shadow(launch_params_metal,
+                                    payload,
+                                    object,
+                                    prim,
+                                    type,
+                                    ray_origin,
+                                    ray_direction,
 #  if defined(__METALRT_MOTION__)
-                             payload.time,
+                                    payload.time,
 #  else
-                             0.0f,
+                                    0.0f,
 #  endif
-                             ray_tmin, ray_tmax, result);
-
-  return result;
-}
-
-[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
-BoundingBoxIntersectionResult
-__intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
-                                    ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
-                                    const uint object [[user_instance_id]],
-                                    const uint primitive_id [[primitive_id]],
-                                    const float3 ray_origin [[origin]],
-                                    const float3 ray_direction [[direction]],
-                                    const float ray_tmin [[min_distance]],
-                                    const float ray_tmax [[max_distance]])
-{
-  const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
-  const int type = kernel_data_fetch(objects, object).primitive_type;
-
-  BoundingBoxIntersectionResult result;
-  result.accept = false;
-  result.continue_search = true;
-  result.distance = ray_tmax;
-
-  metalrt_intersection_point_shadow(launch_params_metal, payload, object, prim, type, ray_origin, ray_direction,
-#  if defined(__METALRT_MOTION__)
-                             payload.time,
-#  else
-                             0.0f,
-#  endif
-                             ray_tmin, ray_tmax, result);
+                                    ray_tmin,
+                                    ray_tmax,
+                                    result);

  return result;
 }
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -149,25 +149,13 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
 /* clang-format on */

 /* Types */
+
 /* It's not possible to use sycl types like sycl::float3, sycl::int3, etc
- * because these types have different interfaces from blender version */
+ * because these types have different interfaces from blender version. */

 using uchar = unsigned char;
 using sycl::half;

-struct float3 {
-  float x, y, z;
-};
-
-ccl_always_inline float3 make_float3(float x, float y, float z)
-{
-  return {x, y, z};
-}
-ccl_always_inline float3 make_float3(float x)
-{
-  return {x, x, x};
-}
-
 /* math functions */
 #define fabsf(x) sycl::fabs((x))
 #define copysignf(x, y) sycl::copysign((x), (y))
--- a/intern/cycles/kernel/device/oneapi/dll_interface_template.h
+++ b/intern/cycles/kernel/device/oneapi/dll_interface_template.h
@@ -6,7 +6,8 @@ DLL_INTERFACE_CALL(oneapi_device_capabilities, char *)
 DLL_INTERFACE_CALL(oneapi_free, void, void *)
 DLL_INTERFACE_CALL(oneapi_get_memcapacity, size_t, SyclQueue *queue)

-DLL_INTERFACE_CALL(oneapi_get_compute_units_amount, size_t, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_get_num_multiprocessors, int, SyclQueue *queue)
+DLL_INTERFACE_CALL(oneapi_get_max_num_threads_per_multiprocessor, int, SyclQueue *queue)
 DLL_INTERFACE_CALL(oneapi_iterate_devices, void, OneAPIDeviceIteratorCallback cb, void *user_ptr)
 DLL_INTERFACE_CALL(oneapi_set_error_cb, void, OneAPIErrorCallback, void *user_ptr)

--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -904,11 +904,26 @@ size_t oneapi_get_memcapacity(SyclQueue *queue)
      .get_info<sycl::info::device::global_mem_size>();
 }

-size_t oneapi_get_compute_units_amount(SyclQueue *queue)
+int oneapi_get_num_multiprocessors(SyclQueue *queue)
 {
-  return reinterpret_cast<sycl::queue *>(queue)
-      ->get_device()
-      .get_info<sycl::info::device::max_compute_units>();
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+  if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
+    return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
+  }
+  else
+    return 0;
+}
+
+int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
+{
+  const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
+  if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
+      device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+    return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
+           device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
+  }
+  else
+    return 0;
 }

 #endif /* WITH_ONEAPI */
--- a/intern/cycles/kernel/device/optix/bvh.h
+++ b/intern/cycles/kernel/device/optix/bvh.h
@@ -0,0 +1,646 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+/* OptiX implementation of ray-scene intersection. */
+
+#pragma once
+
+#include "kernel/bvh/types.h"
+#include "kernel/bvh/util.h"
+
+#define OPTIX_DEFINE_ABI_VERSION_ONLY
+#include <optix_function_table.h>
+
+CCL_NAMESPACE_BEGIN
+
+/* Utilities. */
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
+}
+template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
+{
+  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
+}
+
+template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
+{
+  return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
+}
+
+ccl_device_forceinline int get_object_id()
+{
+#ifdef __OBJECT_MOTION__
+  /* Always get the instance ID from the TLAS.
+   * There might be a motion transform node between TLAS and BLAS which does not have one. */
+  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
+#else
+  return optixGetInstanceId();
+#endif
+}
+
+/* Hit/miss functions. */
+
+extern "C" __global__ void __miss__kernel_optix_miss()
+{
+  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
+  optixSetPayload_5(PRIMITIVE_NONE);
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_local_hit()
+{
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves and points. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+#ifdef __BVH_LOCAL__
+  const int object = get_object_id();
+  if (object != optixGetPayload_4() /* local_object */) {
+    /* Only intersect with matching object. */
+    return optixIgnoreIntersection();
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_local(ray->self, prim)) {
+    return optixIgnoreIntersection();
+  }
+
+  const uint max_hits = optixGetPayload_5();
+  if (max_hits == 0) {
+    /* Special case for when no hit information is requested: only report that a hit occurred. */
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  int hit = 0;
+  uint *const lcg_state = get_payload_ptr_0<uint>();
+  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
+
+  if (lcg_state) {
+    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
+      if (optixGetRayTmax() == local_isect->hits[i].t) {
+        return optixIgnoreIntersection(); /* A hit at this distance is already recorded. */
+      }
+    }
+
+    hit = local_isect->num_hits++;
+
+    if (local_isect->num_hits > max_hits) {
+      hit = lcg_step_uint(lcg_state) % local_isect->num_hits; /* Random slot once full. */
+      if (hit >= max_hits) {
+        return optixIgnoreIntersection();
+      }
+    }
+  }
+  else {
+    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
+      /* Record closest intersection only.
+       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+       */
+      return optixIgnoreIntersection();
+    }
+
+    local_isect->num_hits = 1;
+  }
+
+  Intersection *isect = &local_isect->hits[hit];
+  isect->t = optixGetRayTmax();
+  isect->prim = prim;
+  isect->object = get_object_id();
+  isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
+
+  const float2 barycentrics = optixGetTriangleBarycentrics();
+  isect->u = barycentrics.x;
+  isect->v = barycentrics.y;
+
+  /* Record geometric normal. */
+  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
+  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
+  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
+
+  /* Continue tracing (without this the trace call would return after the first hit). */
+  optixIgnoreIntersection();
+#endif
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
+{
+#ifdef __SHADOW_RECORD_ALL__
+  int prim = optixGetPrimitiveIndex();
+  const uint object = get_object_id();
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#  endif
+
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self_shadow(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+
+  float u = 0.0f, v = 0.0f;
+  int type = 0;
+  if (optixIsTriangleHit()) {
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    u = barycentrics.x;
+    v = barycentrics.y;
+    type = kernel_data_fetch(objects, object).primitive_type;
+  }
+#  ifdef __HAIR__
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    u = __uint_as_float(optixGetAttribute_0());
+    v = __uint_as_float(optixGetAttribute_1());
+
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+    type = segment.type;
+    prim = segment.prim;
+
+#    if OPTIX_ABI_VERSION < 55
+    /* Filter out curve end-caps. */
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+#    endif
+  }
+#  endif
+  else {
+    type = kernel_data_fetch(objects, object).primitive_type;
+    u = 0.0f;
+    v = 0.0f;
+  }
+
+#  ifndef __TRANSPARENT_SHADOWS__
+  /* No transparent shadows support compiled in, make opaque. */
+  optixSetPayload_5(true);
+  return optixTerminateRay();
+#  else
+  const uint max_hits = optixGetPayload_3();
+  const uint num_hits_packed = optixGetPayload_2();
+  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
+  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
+
+  /* Max hits reached or no transparent shadows: all light is blocked, stop immediately. */
+  if (num_hits >= max_hits ||
+      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
+    optixSetPayload_5(true);
+    return optixTerminateRay();
+  }
+
+  /* Always use baked shadow transparency for curves. */
+  if (type & PRIMITIVE_CURVE) {
+    float throughput = __uint_as_float(optixGetPayload_1());
+    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
+    optixSetPayload_1(__float_as_uint(throughput));
+    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
+
+    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
+      optixSetPayload_5(true);
+      return optixTerminateRay();
+    }
+    else {
+      /* Continue tracing. */
+      optixIgnoreIntersection();
+      return;
+    }
+  }
+
+  /* Record transparent intersection. */
+  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
+
+  uint record_index = num_recorded_hits;
+
+  const IntegratorShadowState state = optixGetPayload_0();
+
+  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  if (record_index >= max_record_hits) {
+    /* If maximum number of hits reached, find a hit to replace. */
+    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
+    uint max_recorded_hit = 0;
+
+    for (int i = 1; i < max_record_hits; i++) {
+      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
+      if (isect_t > max_recorded_t) {
+        max_recorded_t = isect_t;
+        max_recorded_hit = i;
+      }
+    }
+
+    if (optixGetRayTmax() >= max_recorded_t) {
+      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
+       * current hit anymore. */
+      return;
+    }
+
+    record_index = max_recorded_hit;
+  }
+
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
+  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
+
+  /* Continue tracing. */
+  optixIgnoreIntersection();
+#  endif /* __TRANSPARENT_SHADOWS__ */
+#endif   /* __SHADOW_RECORD_ALL__ */
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
+{
+#if defined(__HAIR__) || defined(__POINTCLOUD__)
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves and points. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  const uint object = get_object_id();
+#ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+    return optixIgnoreIntersection(); /* Object is not flagged as having a volume. */
+  }
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+  if (intersection_skip_self(ray->self, object, prim)) {
+    return optixIgnoreIntersection();
+  }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
+#ifdef __HAIR__
+#  if OPTIX_ABI_VERSION < 55
+  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
+    /* Filter out curve end-caps. */
+    const float u = __uint_as_float(optixGetAttribute_0());
+    if (u == 0.0f || u == 1.0f) {
+      return optixIgnoreIntersection();
+    }
+  }
+#  endif
+#endif
+
+  const uint object = get_object_id();
+  const uint visibility = optixGetPayload_4();
+#ifdef __VISIBILITY_FLAG__
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+#endif
+
+  const int prim = optixGetPrimitiveIndex();
+  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
+
+  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    if (intersection_skip_self_shadow(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+    else {
+      /* Shadow ray early termination. */
+      return optixTerminateRay();
+    }
+  }
+  else {
+    if (intersection_skip_self(ray->self, object, prim)) {
+      return optixIgnoreIntersection();
+    }
+  }
+}
+
+extern "C" __global__ void __closesthit__kernel_optix_hit()
+{
+  const int object = get_object_id();
+  const int prim = optixGetPrimitiveIndex();
+
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance. */
+  optixSetPayload_4(object);
+
+  if (optixIsTriangleHit()) {
+    const float2 barycentrics = optixGetTriangleBarycentrics();
+    optixSetPayload_1(__float_as_uint(barycentrics.x));
+    optixSetPayload_2(__float_as_uint(barycentrics.y));
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
+  }
+  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
+    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
+    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()'. */
+    optixSetPayload_2(optixGetAttribute_1());
+    optixSetPayload_3(segment.prim);
+    optixSetPayload_5(segment.type);
+  }
+  else {
+    optixSetPayload_1(0); /* Hit kind is PRIMITIVE_POINT: u and v are reported as zero. */
+    optixSetPayload_2(0);
+    optixSetPayload_3(prim);
+    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
+  }
+}
+
+/* Custom primitive intersection functions. */
+
+#ifdef __HAIR__
+ccl_device_inline void optix_intersection_curve(const int prim, const int type)
+{
+  const int object = get_object_id();
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  const float3 ray_P = optixGetObjectRayOrigin();
+  const float3 ray_D = optixGetObjectRayDirection();
+  const float ray_tmin = optixGetRayTmin();
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+
+  if (curve_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t,
+                            type & PRIMITIVE_ALL,
+                            __float_as_int(isect.u),  /* Attribute_0 */
+                            __float_as_int(isect.v)); /* Attribute_1 */
+  }
+}
+
+extern "C" __global__ void __intersection__curve_ribbon()
+{
+  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex());
+  const int prim = segment.prim;
+  const int type = segment.type;
+  if (type & PRIMITIVE_CURVE_RIBBON) {
+    optix_intersection_curve(prim, type);
+  }
+}
+
+#endif
+
+#ifdef __POINTCLOUD__
+extern "C" __global__ void __intersection__point()
+{
+  const int prim = optixGetPrimitiveIndex();
+  const int object = get_object_id();
+  const int type = kernel_data_fetch(objects, object).primitive_type;
+
+#  ifdef __VISIBILITY_FLAG__
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
+    return;
+  }
+#  endif
+
+  const float3 ray_P = optixGetObjectRayOrigin();
+  const float3 ray_D = optixGetObjectRayDirection();
+  const float ray_tmin = optixGetRayTmin();
+
+#  ifdef __OBJECT_MOTION__
+  const float time = optixGetRayTime();
+#  else
+  const float time = 0.0f;
+#  endif
+
+  Intersection isect;
+  isect.t = optixGetRayTmax();
+
+  if (point_intersect(NULL, &isect, ray_P, ray_D, ray_tmin, isect.t, object, prim, time, type)) {
+    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
+    optixReportIntersection(isect.t, type & PRIMITIVE_ALL);
+  }
+}
+#endif
+
+/* Scene intersection. */
+
+ccl_device_intersect bool scene_intersect(KernelGlobals kg,
+                                          ccl_private const Ray *ray,
+                                          const uint visibility,
+                                          ccl_private Intersection *isect)
+{
+  uint p0 = 0;
+  uint p1 = 0;
+  uint p2 = 0;
+  uint p3 = 0;
+  uint p4 = visibility;
+  uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  uint ray_mask = visibility & 0xFF;
+  uint ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             ray_flags,
+             0, /* SBT offset for PG_HITD */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  isect->t = __uint_as_float(p0);
+  isect->u = __uint_as_float(p1);
+  isect->v = __uint_as_float(p2);
+  isect->prim = p3;
+  isect->object = p4;
+  isect->type = p5;
+
+  return p5 != PRIMITIVE_NONE;
+}
+
+#ifdef __BVH_LOCAL__
+ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
+                                                ccl_private const Ray *ray,
+                                                ccl_private LocalIntersection *local_isect,
+                                                int local_object,
+                                                ccl_private uint *lcg_state,
+                                                int max_hits)
+{
+  uint p0 = pointer_pack_to_uint_0(lcg_state);
+  uint p1 = pointer_pack_to_uint_1(lcg_state);
+  uint p2 = pointer_pack_to_uint_0(local_isect);
+  uint p3 = pointer_pack_to_uint_1(local_isect);
+  uint p4 = local_object;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
+  uint p5 = max_hits;
+
+  if (local_isect) {
+    local_isect->num_hits = 0; /* Initialize hit count to zero. */
+  }
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             0xFF,
+             /* Need to always call into __anyhit__kernel_optix_local_hit. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             2, /* SBT offset for PG_HITL */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  return p5;
+}
+#endif
+
+#ifdef __SHADOW_RECORD_ALL__
+ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
+                                                     IntegratorShadowState state,
+                                                     ccl_private const Ray *ray,
+                                                     uint visibility,
+                                                     uint max_hits,
+                                                     ccl_private uint *num_recorded_hits,
+                                                     ccl_private float *throughput)
+{
+  uint p0 = state;
+  uint p1 = __float_as_uint(1.0f); /* Throughput. */
+  uint p2 = 0;                     /* Number of hits. */
+  uint p3 = max_hits;
+  uint p4 = visibility;
+  uint p5 = false;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             1, /* SBT offset for PG_HITS */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  *num_recorded_hits = uint16_unpack_from_uint_0(p2);
+  *throughput = __uint_as_float(p1);
+
+  return p5;
+}
+#endif
+
+#ifdef __VOLUME__
+ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
+                                                 ccl_private const Ray *ray,
+                                                 ccl_private Intersection *isect,
+                                                 const uint visibility)
+{
+  uint p0 = 0;
+  uint p1 = 0;
+  uint p2 = 0;
+  uint p3 = 0;
+  uint p4 = visibility;
+  uint p5 = PRIMITIVE_NONE;
+  uint p6 = ((uint64_t)ray) & 0xFFFFFFFF;
+  uint p7 = (((uint64_t)ray) >> 32) & 0xFFFFFFFF;
+
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  optixTrace(intersection_ray_valid(ray) ? kernel_data.device_bvh : 0,
+             ray->P,
+             ray->D,
+             ray->tmin,
+             ray->tmax,
+             ray->time,
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_volume_test. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             3, /* SBT offset for PG_HITV */
+             0,
+             0,
+             p0,
+             p1,
+             p2,
+             p3,
+             p4,
+             p5,
+             p6,
+             p7);
+
+  isect->t = __uint_as_float(p0);
+  isect->u = __uint_as_float(p1);
+  isect->v = __uint_as_float(p2);
+  isect->prim = p3;
+  isect->object = p4;
+  isect->type = p5;
+
+  return p5 != PRIMITIVE_NONE;
+}
+#endif
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -8,7 +8,6 @@
 #include <optix.h>

 #define __KERNEL_GPU__
-#define __KERNEL_GPU_RAYTRACING__
 #define __KERNEL_CUDA__ /* OptiX kernels are implicitly CUDA kernels too */
 #define __KERNEL_OPTIX__
 #define CCL_NAMESPACE_BEGIN
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -20,34 +20,6 @@
 #include "kernel/integrator/intersect_volume_stack.h"
 // clang-format on

-#define OPTIX_DEFINE_ABI_VERSION_ONLY
-#include <optix_function_table.h>
-
-template<typename T> ccl_device_forceinline T *get_payload_ptr_0()
-{
-  return pointer_unpack_from_uint<T>(optixGetPayload_0(), optixGetPayload_1());
-}
-template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
-{
-  return pointer_unpack_from_uint<T>(optixGetPayload_2(), optixGetPayload_3());
-}
-
-template<typename T> ccl_device_forceinline T *get_payload_ptr_6()
-{
-  return (T *)(((uint64_t)optixGetPayload_7() << 32) | optixGetPayload_6());
-}
-
-ccl_device_forceinline int get_object_id()
-{
-#ifdef __OBJECT_MOTION__
-  /* Always get the instance ID from the TLAS
-   * There might be a motion transform node between TLAS and BLAS which does not have one. */
-  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
-#else
-  return optixGetInstanceId();
-#endif
-}
-
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
 {
  const int global_index = optixGetLaunchIndex().x;
@@ -84,411 +56,3 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st
  integrator_intersect_volume_stack(nullptr, path_index);
 }

-extern "C" __global__ void __miss__kernel_optix_miss()
-{
-  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
-  optixSetPayload_5(PRIMITIVE_NONE);
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_local_hit()
-{
-#if defined(__HAIR__) || defined(__POINTCLOUD__)
-  if (!optixIsTriangleHit()) {
-    /* Ignore curves and points. */
-    return optixIgnoreIntersection();
-  }
-#endif
-
-#ifdef __BVH_LOCAL__
-  const int object = get_object_id();
-  if (object != optixGetPayload_4() /* local_object */) {
-    /* Only intersect with matching object. */
-    return optixIgnoreIntersection();
-  }
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self_local(ray->self, prim)) {
-    return optixIgnoreIntersection();
-  }
-
-  const uint max_hits = optixGetPayload_5();
-  if (max_hits == 0) {
-    /* Special case for when no hit information is requested, just report that something was hit */
-    optixSetPayload_5(true);
-    return optixTerminateRay();
-  }
-
-  int hit = 0;
-  uint *const lcg_state = get_payload_ptr_0<uint>();
-  LocalIntersection *const local_isect = get_payload_ptr_2<LocalIntersection>();
-
-  if (lcg_state) {
-    for (int i = min(max_hits, local_isect->num_hits) - 1; i >= 0; --i) {
-      if (optixGetRayTmax() == local_isect->hits[i].t) {
-        return optixIgnoreIntersection();
-      }
-    }
-
-    hit = local_isect->num_hits++;
-
-    if (local_isect->num_hits > max_hits) {
-      hit = lcg_step_uint(lcg_state) % local_isect->num_hits;
-      if (hit >= max_hits) {
-        return optixIgnoreIntersection();
-      }
-    }
-  }
-  else {
-    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      /* Record closest intersection only.
-       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
-       */
-      return optixIgnoreIntersection();
-    }
-
-    local_isect->num_hits = 1;
-  }
-
-  Intersection *isect = &local_isect->hits[hit];
-  isect->t = optixGetRayTmax();
-  isect->prim = prim;
-  isect->object = get_object_id();
-  isect->type = kernel_data_fetch(objects, isect->object).primitive_type;
-
-  const float2 barycentrics = optixGetTriangleBarycentrics();
-  isect->u = 1.0f - barycentrics.y - barycentrics.x;
-  isect->v = barycentrics.x;
-
-  /* Record geometric normal. */
-  const uint tri_vindex = kernel_data_fetch(tri_vindex, prim).w;
-  const float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0);
-  const float3 tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1);
-  const float3 tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
-  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
-
-  /* Continue tracing (without this the trace call would return after the first hit). */
-  optixIgnoreIntersection();
-#endif
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
-{
-#ifdef __SHADOW_RECORD_ALL__
-  int prim = optixGetPrimitiveIndex();
-  const uint object = get_object_id();
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#  endif
-
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self_shadow(ray->self, object, prim)) {
-    return optixIgnoreIntersection();
-  }
-
-  float u = 0.0f, v = 0.0f;
-  int type = 0;
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    u = 1.0f - barycentrics.y - barycentrics.x;
-    v = barycentrics.x;
-    type = kernel_data_fetch(objects, object).primitive_type;
-  }
-#  ifdef __HAIR__
-  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
-    u = __uint_as_float(optixGetAttribute_0());
-    v = __uint_as_float(optixGetAttribute_1());
-
-    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
-    type = segment.type;
-    prim = segment.prim;
-
-#    if OPTIX_ABI_VERSION < 55
-    /* Filter out curve endcaps. */
-    if (u == 0.0f || u == 1.0f) {
-      return optixIgnoreIntersection();
-    }
-#    endif
-  }
-#  endif
-  else {
-    type = kernel_data_fetch(objects, object).primitive_type;
-    u = 0.0f;
-    v = 0.0f;
-  }
-
-#  ifndef __TRANSPARENT_SHADOWS__
-  /* No transparent shadows support compiled in, make opaque. */
-  optixSetPayload_5(true);
-  return optixTerminateRay();
-#  else
-  const uint max_hits = optixGetPayload_3();
-  const uint num_hits_packed = optixGetPayload_2();
-  const uint num_recorded_hits = uint16_unpack_from_uint_0(num_hits_packed);
-  const uint num_hits = uint16_unpack_from_uint_1(num_hits_packed);
-
-  /* If no transparent shadows, all light is blocked and we can stop immediately. */
-  if (num_hits >= max_hits ||
-      !(intersection_get_shader_flags(NULL, prim, type) & SD_HAS_TRANSPARENT_SHADOW)) {
-    optixSetPayload_5(true);
-    return optixTerminateRay();
-  }
-
-  /* Always use baked shadow transparency for curves. */
-  if (type & PRIMITIVE_CURVE) {
-    float throughput = __uint_as_float(optixGetPayload_1());
-    throughput *= intersection_curve_shadow_transparency(nullptr, object, prim, u);
-    optixSetPayload_1(__float_as_uint(throughput));
-    optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits, num_hits + 1));
-
-    if (throughput < CURVE_SHADOW_TRANSPARENCY_CUTOFF) {
-      optixSetPayload_5(true);
-      return optixTerminateRay();
-    }
-    else {
-      /* Continue tracing. */
-      optixIgnoreIntersection();
-      return;
-    }
-  }
-
-  /* Record transparent intersection. */
-  optixSetPayload_2(uint16_pack_to_uint(num_recorded_hits + 1, num_hits + 1));
-
-  uint record_index = num_recorded_hits;
-
-  const IntegratorShadowState state = optixGetPayload_0();
-
-  const uint max_record_hits = min(max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
-  if (record_index >= max_record_hits) {
-    /* If maximum number of hits reached, find a hit to replace. */
-    float max_recorded_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, 0, t);
-    uint max_recorded_hit = 0;
-
-    for (int i = 1; i < max_record_hits; i++) {
-      const float isect_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, i, t);
-      if (isect_t > max_recorded_t) {
-        max_recorded_t = isect_t;
-        max_recorded_hit = i;
-      }
-    }
-
-    if (optixGetRayTmax() >= max_recorded_t) {
-      /* Accept hit, so that OptiX won't consider any more hits beyond the distance of the
-       * current hit anymore. */
-      return;
-    }
-
-    record_index = max_recorded_hit;
-  }
-
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, u) = u;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, v) = v;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, t) = optixGetRayTmax();
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, prim) = prim;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, object) = object;
-  INTEGRATOR_STATE_ARRAY_WRITE(state, shadow_isect, record_index, type) = type;
-
-  /* Continue tracing. */
-  optixIgnoreIntersection();
-#  endif /* __TRANSPARENT_SHADOWS__ */
-#endif   /* __SHADOW_RECORD_ALL__ */
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_volume_test()
-{
-#if defined(__HAIR__) || defined(__POINTCLOUD__)
-  if (!optixIsTriangleHit()) {
-    /* Ignore curves. */
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  const uint object = get_object_id();
-#ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  if ((kernel_data_fetch(object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
-    return optixIgnoreIntersection();
-  }
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-  if (intersection_skip_self(ray->self, object, prim)) {
-    return optixIgnoreIntersection();
-  }
-}
-
-extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
-{
-#ifdef __HAIR__
-#  if OPTIX_ABI_VERSION < 55
-  if (optixGetPrimitiveType() == OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE) {
-    /* Filter out curve endcaps. */
-    const float u = __uint_as_float(optixGetAttribute_0());
-    if (u == 0.0f || u == 1.0f) {
-      return optixIgnoreIntersection();
-    }
-  }
-#  endif
-#endif
-
-  const uint object = get_object_id();
-  const uint visibility = optixGetPayload_4();
-#ifdef __VISIBILITY_FLAG__
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return optixIgnoreIntersection();
-  }
-#endif
-
-  const int prim = optixGetPrimitiveIndex();
-  ccl_private Ray *const ray = get_payload_ptr_6<Ray>();
-
-  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
-    if (intersection_skip_self_shadow(ray->self, object, prim)) {
-      return optixIgnoreIntersection();
-    }
-    else {
-      /* Shadow ray early termination. */
-      return optixTerminateRay();
-    }
-  }
-  else {
-    if (intersection_skip_self(ray->self, object, prim)) {
-      return optixIgnoreIntersection();
-    }
-  }
-}
-
-extern "C" __global__ void __closesthit__kernel_optix_hit()
-{
-  const int object = get_object_id();
-  const int prim = optixGetPrimitiveIndex();
-
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
-  optixSetPayload_4(object);
-
-  if (optixIsTriangleHit()) {
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    optixSetPayload_1(__float_as_uint(1.0f - barycentrics.y - barycentrics.x));
-    optixSetPayload_2(__float_as_uint(barycentrics.x));
-    optixSetPayload_3(prim);
-    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
-  }
-  else if ((optixGetHitKind() & (~PRIMITIVE_MOTION)) != PRIMITIVE_POINT) {
-    const KernelCurveSegment segment = kernel_data_fetch(curve_segments, prim);
-    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
-    optixSetPayload_2(optixGetAttribute_1());
-    optixSetPayload_3(segment.prim);
-    optixSetPayload_5(segment.type);
-  }
-  else {
-    optixSetPayload_1(0);
-    optixSetPayload_2(0);
-    optixSetPayload_3(prim);
-    optixSetPayload_5(kernel_data_fetch(objects, object).primitive_type);
-  }
-}
-
-#ifdef __HAIR__
-ccl_device_inline void optix_intersection_curve(const int prim, const int type)
-{
-  const int object = get_object_id();
-
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return;
-  }
-#  endif
-
-  float3 P = optixGetObjectRayOrigin();
-  float3 dir = optixGetObjectRayDirection();
-  float tmin = optixGetRayTmin();
-
-  /* The direction is not normalized by default, but the curve intersection routine expects that */
-  float len;
-  dir = normalize_len(dir, &len);
-
-#  ifdef __OBJECT_MOTION__
-  const float time = optixGetRayTime();
-#  else
-  const float time = 0.0f;
-#  endif
-
-  Intersection isect;
-  isect.t = optixGetRayTmax();
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX)
-    isect.t *= len;
-
-  if (curve_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) {
-    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
-    optixReportIntersection(isect.t / len,
-                            type & PRIMITIVE_ALL,
-                            __float_as_int(isect.u),  /* Attribute_0 */
-                            __float_as_int(isect.v)); /* Attribute_1 */
-  }
-}
-
-extern "C" __global__ void __intersection__curve_ribbon()
-{
-  const KernelCurveSegment segment = kernel_data_fetch(curve_segments, optixGetPrimitiveIndex());
-  const int prim = segment.prim;
-  const int type = segment.type;
-  if (type & PRIMITIVE_CURVE_RIBBON) {
-    optix_intersection_curve(prim, type);
-  }
-}
-
-#endif
-
-#ifdef __POINTCLOUD__
-extern "C" __global__ void __intersection__point()
-{
-  const int prim = optixGetPrimitiveIndex();
-  const int object = get_object_id();
-  const int type = kernel_data_fetch(objects, object).primitive_type;
-
-#  ifdef __VISIBILITY_FLAG__
-  const uint visibility = optixGetPayload_4();
-  if ((kernel_data_fetch(objects, object).visibility & visibility) == 0) {
-    return;
-  }
-#  endif
-
-  float3 P = optixGetObjectRayOrigin();
-  float3 dir = optixGetObjectRayDirection();
-  float tmin = optixGetRayTmin();
-
-  /* The direction is not normalized by default, the point intersection routine expects that. */
-  float len;
-  dir = normalize_len(dir, &len);
-
-#  ifdef __OBJECT_MOTION__
-  const float time = optixGetRayTime();
-#  else
-  const float time = 0.0f;
-#  endif
-
-  Intersection isect;
-  isect.t = optixGetRayTmax();
-  /* Transform maximum distance into object space. */
-  if (isect.t != FLT_MAX) {
-    isect.t *= len;
-  }
-
-  if (point_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) {
-    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
-    optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
-  }
-}
-#endif
--- a/intern/cycles/kernel/geom/curve_intersect.h
+++ b/intern/cycles/kernel/geom/curve_intersect.h
@@ -72,7 +72,7 @@ ccl_device_inline float sqr_point_to_line_distance(const float3 PmQ0, const floa
 ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
                                          const float3 cylinder_end,
                                          const float cylinder_radius,
-                                          const float3 ray_dir,
+                                          const float3 ray_D,
                                          ccl_private float2 *t_o,
                                          ccl_private float *u0_o,
                                          ccl_private float3 *Ng0_o,
@@ -82,7 +82,7 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
  /* Calculate quadratic equation to solve. */
  const float rl = 1.0f / len(cylinder_end - cylinder_start);
  const float3 P0 = cylinder_start, dP = (cylinder_end - cylinder_start) * rl;
-  const float3 O = -P0, dO = ray_dir;
+  const float3 O = -P0, dO = ray_D;

  const float dOdO = dot(dO, dO);
  const float OdO = dot(dO, O);
@@ -123,7 +123,7 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
  /* Calculates u and Ng for near hit. */
  {
    *u0_o = (t0 * dOz + Oz) * rl;
-    const float3 Pr = t0 * ray_dir;
+    const float3 Pr = t0 * ray_D;
    const float3 Pl = (*u0_o) * (cylinder_end - cylinder_start) + cylinder_start;
    *Ng0_o = Pr - Pl;
  }
@@ -131,7 +131,7 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
  /* Calculates u and Ng for far hit. */
  {
    *u1_o = (t1 * dOz + Oz) * rl;
-    const float3 Pr = t1 * ray_dir;
+    const float3 Pr = t1 * ray_D;
    const float3 Pl = (*u1_o) * (cylinder_end - cylinder_start) + cylinder_start;
    *Ng1_o = Pr - Pl;
  }
@@ -141,10 +141,10 @@ ccl_device_inline bool cylinder_intersect(const float3 cylinder_start,
  return true;
 }

-ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_dir)
+ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, const float3 ray_D)
 {
  const float3 O = -P;
-  const float3 D = ray_dir;
+  const float3 D = ray_D;
  const float ON = dot(O, N);
  const float DN = dot(D, N);
  const float min_rcp_input = 1e-18f;
@@ -155,7 +155,7 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
  return make_float2(lower, upper);
 }

-ccl_device bool curve_intersect_iterative(const float3 ray_dir,
+ccl_device bool curve_intersect_iterative(const float3 ray_D,
                                          const float ray_tmin,
                                          ccl_private float *ray_tmax,
                                          const float dt,
@@ -165,7 +165,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
                                          const bool use_backfacing,
                                          ccl_private Intersection *isect)
 {
-  const float length_ray_dir = len(ray_dir);
+  const float length_ray_D = len(ray_D);

  /* Error of curve evaluations is proportional to largest coordinate. */
  const float4 box_min = min(min(curve[0], curve[1]), min(curve[2], curve[3]));
@@ -176,9 +176,9 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
  const float radius_max = box_max.w;

  for (int i = 0; i < CURVE_NUM_JACOBIAN_ITERATIONS; i++) {
-    const float3 Q = ray_dir * t;
-    const float3 dQdt = ray_dir;
-    const float Q_err = 16.0f * FLT_EPSILON * length_ray_dir * t;
+    const float3 Q = ray_D * t;
+    const float3 dQdt = ray_D;
+    const float Q_err = 16.0f * FLT_EPSILON * length_ray_D * t;

    const float4 P4 = catmull_rom_basis_eval(curve, u);
    const float4 dPdu4 = catmull_rom_basis_derivative(curve, u);
@@ -233,7 +233,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
      const float3 U = dradiusdu * R + dPdu;
      const float3 V = cross(dPdu, R);
      const float3 Ng = cross(V, U);
-      if (!use_backfacing && dot(ray_dir, Ng) > 0.0f) {
+      if (!use_backfacing && dot(ray_D, Ng) > 0.0f) {
        return false;
      }

@@ -249,8 +249,8 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
  return false;
 }

-ccl_device bool curve_intersect_recursive(const float3 ray_orig,
-                                          const float3 ray_dir,
+ccl_device bool curve_intersect_recursive(const float3 ray_P,
+                                          const float3 ray_D,
                                          const float ray_tmin,
                                          float ray_tmax,
                                          float4 curve[4],
@@ -258,8 +258,8 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
 {
  /* Move ray closer to make intersection stable. */
  const float3 center = float4_to_float3(0.25f * (curve[0] + curve[1] + curve[2] + curve[3]));
-  const float dt = dot(center - ray_orig, ray_dir) / dot(ray_dir, ray_dir);
-  const float3 ref = ray_orig + ray_dir * dt;
+  const float dt = dot(center - ray_P, ray_D) / dot(ray_D, ray_D);
+  const float3 ref = ray_P + ray_D * dt;
  const float4 ref4 = make_float4(ref.x, ref.y, ref.z, 0.0f);
  curve[0] -= ref4;
  curve[1] -= ref4;
@@ -322,7 +322,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
      valid = cylinder_intersect(float4_to_float3(P0),
                                 float4_to_float3(P3),
                                 r_outer,
-                                 ray_dir,
+                                 ray_D,
                                 &tc_outer,
                                 &u_outer0,
                                 &Ng_outer0,
@@ -335,11 +335,10 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
      /* Intersect with cap-planes. */
      float2 tp = make_float2(ray_tmin - dt, ray_tmax - dt);
      tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
-      const float2 h0 = half_plane_intersect(
-          float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
+      const float2 h0 = half_plane_intersect(float4_to_float3(P0), float4_to_float3(dP0du), ray_D);
      tp = make_float2(max(tp.x, h0.x), min(tp.y, h0.y));
      const float2 h1 = half_plane_intersect(
-          float4_to_float3(P3), -float4_to_float3(dP3du), ray_dir);
+          float4_to_float3(P3), -float4_to_float3(dP3du), ray_D);
      tp = make_float2(max(tp.x, h1.x), min(tp.y, h1.y));
      valid = tp.x <= tp.y;
      if (!valid) {
@@ -359,7 +358,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
      const bool valid_inner = cylinder_intersect(float4_to_float3(P0),
                                                  float4_to_float3(P3),
                                                  r_inner,
-                                                  ray_dir,
+                                                  ray_D,
                                                  &tc_inner,
                                                  &u_inner0,
                                                  &Ng_inner0,
@@ -369,9 +368,9 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
      /* At the unstable area we subdivide deeper. */
 #  if 0
      const bool unstable0 = (!valid_inner) |
-                             (fabsf(dot(normalize(ray_dir), normalize(Ng_inner0))) < 0.3f);
+                             (fabsf(dot(normalize(ray_D), normalize(Ng_inner0))) < 0.3f);
      const bool unstable1 = (!valid_inner) |
-                             (fabsf(dot(normalize(ray_dir), normalize(Ng_inner1))) < 0.3f);
+                             (fabsf(dot(normalize(ray_D), normalize(Ng_inner1))) < 0.3f);
 #  else
      /* On the GPU appears to be a little faster if always enabled. */
      (void)valid_inner;
@@ -396,7 +395,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
                                          CURVE_NUM_BEZIER_SUBDIVISIONS;
        if (depth >= termDepth) {
          found |= curve_intersect_iterative(
-              ray_dir, ray_tmin, &ray_tmax, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+              ray_D, ray_tmin, &ray_tmax, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
        }
        else {
          recurse = true;
@@ -409,7 +408,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
                                          CURVE_NUM_BEZIER_SUBDIVISIONS;
        if (depth >= termDepth) {
          found |= curve_intersect_iterative(
-              ray_dir, ray_tmin, &ray_tmax, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+              ray_D, ray_tmin, &ray_tmax, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
        }
        else {
          recurse = true;
@@ -519,13 +518,16 @@ ccl_device_inline bool ribbon_intersect_quad(const float ray_tmin,
  return true;
 }

-ccl_device_inline void ribbon_ray_space(const float3 ray_dir, float3 ray_space[3])
+ccl_device_inline void ribbon_ray_space(const float3 ray_D,
+                                        const float ray_D_invlen,
+                                        float3 ray_space[3])
 {
-  const float3 dx0 = make_float3(0, ray_dir.z, -ray_dir.y);
-  const float3 dx1 = make_float3(-ray_dir.z, 0, ray_dir.x);
+  const float3 D = ray_D * ray_D_invlen;
+  const float3 dx0 = make_float3(0, D.z, -D.y);
+  const float3 dx1 = make_float3(-D.z, 0, D.x);
  ray_space[0] = normalize(dot(dx0, dx0) > dot(dx1, dx1) ? dx0 : dx1);
-  ray_space[1] = normalize(cross(ray_dir, ray_space[0]));
-  ray_space[2] = ray_dir;
+  ray_space[1] = normalize(cross(D, ray_space[0]));
+  ray_space[2] = D * ray_D_invlen;
 }

 ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
@@ -537,7 +539,7 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],
 }

 ccl_device_inline bool ribbon_intersect(const float3 ray_org,
-                                        const float3 ray_dir,
+                                        const float3 ray_D,
                                        const float ray_tmin,
                                        float ray_tmax,
                                        const int N,
@@ -545,8 +547,9 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
                                        ccl_private Intersection *isect)
 {
  /* Transform control points into ray space. */
+  const float ray_D_invlen = 1.0f / len(ray_D);
  float3 ray_space[3];
-  ribbon_ray_space(ray_dir, ray_space);
+  ribbon_ray_space(ray_D, ray_D_invlen, ray_space);

  curve[0] = ribbon_to_ray_space(ray_space, ray_org, curve[0]);
  curve[1] = ribbon_to_ray_space(ray_space, ray_org, curve[1]);
@@ -594,7 +597,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
        const float avoidance_factor = 2.0f;
        if (avoidance_factor != 0.0f) {
          float r = mix(p0.w, p1.w, vu);
-          valid0 = vt > avoidance_factor * r;
+          valid0 = vt > avoidance_factor * r * ray_D_invlen;
        }

        if (valid0) {
@@ -619,8 +622,8 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,

 ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
                                            ccl_private Intersection *isect,
-                                            const float3 P,
-                                            const float3 dir,
+                                            const float3 ray_P,
+                                            const float3 ray_D,
                                            const float tmin,
                                            const float tmax,
                                            int object,
@@ -651,7 +654,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
  if (type & PRIMITIVE_CURVE_RIBBON) {
    /* todo: adaptive number of subdivisions could help performance here. */
    const int subdivisions = kernel_data.bvh.curve_subdivisions;
-    if (ribbon_intersect(P, dir, tmin, tmax, subdivisions, curve, isect)) {
+    if (ribbon_intersect(ray_P, ray_D, tmin, tmax, subdivisions, curve, isect)) {
      isect->prim = prim;
      isect->object = object;
      isect->type = type;
@@ -661,7 +664,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
    return false;
  }
  else {
-    if (curve_intersect_recursive(P, dir, tmin, tmax, curve, isect)) {
+    if (curve_intersect_recursive(ray_P, ray_D, tmin, tmax, curve, isect)) {
      isect->prim = prim;
      isect->object = object;
      isect->type = type;
--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -27,8 +27,8 @@ ccl_device_inline float3 motion_triangle_point_from_uv(KernelGlobals kg,
                                                       const float v,
                                                       float3 verts[3])
 {
-  float w = 1.0f - u - v;
-  float3 P = u * verts[0] + v * verts[1] + w * verts[2];
+  /* This appears to give slightly better precision than interpolating with w = (1 - u - v). */
+  float3 P = verts[0] + u * (verts[1] - verts[0]) + v * (verts[2] - verts[0]);

  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
    const Transform tfm = object_get_transform(kg, sd);
--- a/intern/cycles/kernel/geom/object.h
+++ b/intern/cycles/kernel/geom/object.h
@@ -86,7 +86,7 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals kg,
    Transform tfm = object_fetch_transform_motion(kg, object, time);

    if (itfm)
-      *itfm = transform_quick_inverse(tfm);
+      *itfm = transform_inverse(tfm);

    return tfm;
  }
@@ -488,127 +488,54 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)

 /* Transform ray into object space to enter static object in BVH */

-ccl_device_inline float bvh_instance_push(KernelGlobals kg,
-                                          int object,
-                                          ccl_private const Ray *ray,
-                                          ccl_private float3 *P,
-                                          ccl_private float3 *dir,
-                                          ccl_private float3 *idir)
+ccl_device_inline void bvh_instance_push(KernelGlobals kg,
+                                         int object,
+                                         ccl_private const Ray *ray,
+                                         ccl_private float3 *P,
+                                         ccl_private float3 *dir,
+                                         ccl_private float3 *idir)
 {
  Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);

  *P = transform_point(&tfm, ray->P);

-  float len;
-  *dir = bvh_clamp_direction(normalize_len(transform_direction(&tfm, ray->D), &len));
-  *idir = bvh_inverse_direction(*dir);
-
-  return len;
-}
-
-/* Transform ray to exit static object in BVH. */
-
-ccl_device_inline float bvh_instance_pop(KernelGlobals kg,
-                                         int object,
-                                         ccl_private const Ray *ray,
-                                         ccl_private float3 *P,
-                                         ccl_private float3 *dir,
-                                         ccl_private float3 *idir,
-                                         float t)
-{
-  if (t != FLT_MAX) {
-    Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-    t /= len(transform_direction(&tfm, ray->D));
-  }
-
-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
-  *idir = bvh_inverse_direction(*dir);
-
-  return t;
-}
-
-/* Same as above, but returns scale factor to apply to multiple intersection distances */
-
-ccl_device_inline void bvh_instance_pop_factor(KernelGlobals kg,
-                                               int object,
-                                               ccl_private const Ray *ray,
-                                               ccl_private float3 *P,
-                                               ccl_private float3 *dir,
-                                               ccl_private float3 *idir,
-                                               ccl_private float *t_fac)
-{
-  Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
-  *t_fac = 1.0f / len(transform_direction(&tfm, ray->D));
-
-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
+  *dir = bvh_clamp_direction(transform_direction(&tfm, ray->D));
  *idir = bvh_inverse_direction(*dir);
 }

 #ifdef __OBJECT_MOTION__
 /* Transform ray into object space to enter motion blurred object in BVH */

-ccl_device_inline float bvh_instance_motion_push(KernelGlobals kg,
-                                                 int object,
-                                                 ccl_private const Ray *ray,
-                                                 ccl_private float3 *P,
-                                                 ccl_private float3 *dir,
-                                                 ccl_private float3 *idir,
-                                                 ccl_private Transform *itfm)
-{
-  object_fetch_transform_motion_test(kg, object, ray->time, itfm);
-
-  *P = transform_point(itfm, ray->P);
-
-  float len;
-  *dir = bvh_clamp_direction(normalize_len(transform_direction(itfm, ray->D), &len));
-  *idir = bvh_inverse_direction(*dir);
-
-  return len;
-}
-
-/* Transform ray to exit motion blurred object in BVH. */
-
-ccl_device_inline float bvh_instance_motion_pop(KernelGlobals kg,
+ccl_device_inline void bvh_instance_motion_push(KernelGlobals kg,
                                                int object,
                                                ccl_private const Ray *ray,
                                                ccl_private float3 *P,
                                                ccl_private float3 *dir,
-                                                ccl_private float3 *idir,
-                                                float t,
-                                                ccl_private Transform *itfm)
+                                                ccl_private float3 *idir)
 {
-  if (t != FLT_MAX) {
-    t /= len(transform_direction(itfm, ray->D));
-  }
+  Transform tfm;
+  object_fetch_transform_motion_test(kg, object, ray->time, &tfm);

-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
-  *idir = bvh_inverse_direction(*dir);
+  *P = transform_point(&tfm, ray->P);

-  return t;
-}
-
-/* Same as above, but returns scale factor to apply to multiple intersection distances */
-
-ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals kg,
-                                                      int object,
-                                                      ccl_private const Ray *ray,
-                                                      ccl_private float3 *P,
-                                                      ccl_private float3 *dir,
-                                                      ccl_private float3 *idir,
-                                                      ccl_private float *t_fac,
-                                                      ccl_private Transform *itfm)
-{
-  *t_fac = 1.0f / len(transform_direction(itfm, ray->D));
-  *P = ray->P;
-  *dir = bvh_clamp_direction(ray->D);
+  *dir = bvh_clamp_direction(transform_direction(&tfm, ray->D));
  *idir = bvh_inverse_direction(*dir);
 }

 #endif

+/* Transform ray to exit object in BVH: restores P, dir and idir from the original ray. */
+
+ccl_device_inline void bvh_instance_pop(ccl_private const Ray *ray,
+                                        ccl_private float3 *P,
+                                        ccl_private float3 *dir,
+                                        ccl_private float3 *idir)
+{
+  *P = ray->P;
+  *dir = bvh_clamp_direction(ray->D);
+  *idir = bvh_inverse_direction(*dir);
+}
+
 /* TODO: This can be removed when we know if no devices will require explicit
 * address space qualifiers for this case. */

--- a/intern/cycles/kernel/geom/point_intersect.h
+++ b/intern/cycles/kernel/geom/point_intersect.h
@@ -10,20 +10,20 @@ CCL_NAMESPACE_BEGIN
 #ifdef __POINTCLOUD__

 ccl_device_forceinline bool point_intersect_test(const float4 point,
-                                                 const float3 P,
-                                                 const float3 dir,
-                                                 const float tmin,
-                                                 const float tmax,
+                                                 const float3 ray_P,
+                                                 const float3 ray_D,
+                                                 const float ray_tmin,
+                                                 const float ray_tmax,
                                                 ccl_private float *t)
 {
  const float3 center = float4_to_float3(point);
  const float radius = point.w;

-  const float rd2 = 1.0f / dot(dir, dir);
+  const float rd2 = 1.0f / dot(ray_D, ray_D);

-  const float3 c0 = center - P;
-  const float projC0 = dot(c0, dir) * rd2;
-  const float3 perp = c0 - projC0 * dir;
+  const float3 c0 = center - ray_P;
+  const float projC0 = dot(c0, ray_D) * rd2;
+  const float3 perp = c0 - projC0 * ray_D;
  const float l2 = dot(perp, perp);
  const float r2 = radius * radius;
  if (!(l2 <= r2)) {
@@ -32,12 +32,12 @@ ccl_device_forceinline bool point_intersect_test(const float4 point,

  const float td = sqrt((r2 - l2) * rd2);
  const float t_front = projC0 - td;
-  const bool valid_front = (tmin <= t_front) & (t_front <= tmax);
+  const bool valid_front = (ray_tmin <= t_front) & (t_front <= ray_tmax);

  /* Always back-face culling for now. */
 #  if 0
  const float t_back = projC0 + td;
-  const bool valid_back = (tmin <= t_back) & (t_back <= tmax);
+  const bool valid_back = (ray_tmin <= t_back) & (t_back <= ray_tmax);

  /* check if there is a first hit */
  const bool valid_first = valid_front | valid_back;
@@ -58,10 +58,10 @@ ccl_device_forceinline bool point_intersect_test(const float4 point,

 ccl_device_forceinline bool point_intersect(KernelGlobals kg,
                                            ccl_private Intersection *isect,
-                                            const float3 P,
-                                            const float3 dir,
-                                            const float tmin,
-                                            const float tmax,
+                                            const float3 ray_P,
+                                            const float3 ray_D,
+                                            const float ray_tmin,
+                                            const float ray_tmax,
                                            const int object,
                                            const int prim,
                                            const float time,
@@ -70,7 +70,7 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg,
  const float4 point = (type & PRIMITIVE_MOTION) ? motion_point(kg, object, prim, time) :
                                                   kernel_data_fetch(points, prim);

-  if (!point_intersect_test(point, P, dir, tmin, tmax, &isect->t)) {
+  if (!point_intersect_test(point, ray_P, ray_D, ray_tmin, ray_tmax, &isect->t)) {
    return false;
  }

--- a/intern/cycles/kernel/geom/shader_data.h
+++ b/intern/cycles/kernel/geom/shader_data.h
@@ -18,7 +18,7 @@ ccl_device void shader_setup_object_transforms(KernelGlobals kg,
 {
  if (sd->object_flag & SD_OBJECT_MOTION) {
    sd->ob_tfm_motion = object_fetch_transform_motion(kg, sd->object, time);
-    sd->ob_itfm_motion = transform_quick_inverse(sd->ob_tfm_motion);
+    sd->ob_itfm_motion = transform_inverse(sd->ob_tfm_motion);
  }
 }
 #endif
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -94,11 +94,11 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,
    float2 uv[3];
    subd_triangle_patch_uv(kg, sd, uv);

-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];

    /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];

    float a, dads, dadt;
    a = patch_eval_float(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -165,12 +165,12 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_CORNER) {
    float2 uv[3];
@@ -195,12 +195,12 @@ ccl_device_noinline float subd_triangle_attribute_float(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
    if (dx)
@@ -233,11 +233,11 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,
    float2 uv[3];
    subd_triangle_patch_uv(kg, sd, uv);

-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];

    /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];

    float2 a, dads, dadt;

@@ -305,12 +305,12 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_CORNER) {
    float2 uv[3];
@@ -337,12 +337,12 @@ ccl_device_noinline float2 subd_triangle_attribute_float2(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
    if (dx)
@@ -375,11 +375,11 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    float2 uv[3];
    subd_triangle_patch_uv(kg, sd, uv);

-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];

    /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];

    float3 a, dads, dadt;
    a = patch_eval_float3(kg, sd, desc.offset, patch, p.x, p.y, 0, &dads, &dadt);
@@ -446,12 +446,12 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_CORNER) {
    float2 uv[3];
@@ -478,12 +478,12 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
    if (dx)
@@ -516,11 +516,11 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    float2 uv[3];
    subd_triangle_patch_uv(kg, sd, uv);

-    float2 dpdu = uv[0] - uv[2];
-    float2 dpdv = uv[1] - uv[2];
+    float2 dpdu = uv[1] - uv[0];
+    float2 dpdv = uv[2] - uv[0];

    /* p is [s, t] */
-    float2 p = dpdu * sd->u + dpdv * sd->v + uv[2];
+    float2 p = dpdu * sd->u + dpdv * sd->v + uv[0];

    float4 a, dads, dadt;
    if (desc.type == NODE_ATTR_RGBA) {
@@ -592,12 +592,12 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_CORNER || desc.element == ATTR_ELEMENT_CORNER_BYTE) {
    float2 uv[3];
@@ -636,12 +636,12 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * a + sd->dv.dx * b - (sd->du.dx + sd->dv.dx) * c;
+      *dx = sd->du.dx * b + sd->dv.dx * c - (sd->du.dx + sd->dv.dx) * a;
    if (dy)
-      *dy = sd->du.dy * a + sd->dv.dy * b - (sd->du.dy + sd->dv.dy) * c;
+      *dy = sd->du.dy * b + sd->dv.dy * c - (sd->du.dy + sd->dv.dy) * a;
 #endif

-    return sd->u * a + sd->v * b + (1.0f - sd->u - sd->v) * c;
+    return sd->u * b + sd->v * c + (1.0f - sd->u - sd->v) * a;
  }
  else if (desc.element == ATTR_ELEMENT_OBJECT || desc.element == ATTR_ELEMENT_MESH) {
    if (dx)
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@@ -45,8 +45,8 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
  float3 v1 = kernel_data_fetch(tri_verts, tri_vindex.w + 1);
  float3 v2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);
  /* compute point */
-  float t = 1.0f - u - v;
-  *P = (u * v0 + v * v1 + t * v2);
+  float w = 1.0f - u - v;
+  *P = (w * v0 + u * v1 + v * v2);
  /* get object flags */
  int object_flag = kernel_data_fetch(object_flag, object);
  /* compute normal */
@@ -97,7 +97,7 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
  float3 n1 = kernel_data_fetch(tri_vnormal, tri_vindex.y);
  float3 n2 = kernel_data_fetch(tri_vnormal, tri_vindex.z);

-  float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);
+  float3 N = safe_normalize((1.0f - u - v) * n0 + u * n1 + v * n2);

  return is_zero(N) ? Ng : N;
 }
@@ -118,7 +118,7 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
    object_inverse_normal_transform(kg, sd, &n2);
  }

-  float3 N = (1.0f - u - v) * n2 + u * n0 + v * n1;
+  float3 N = (1.0f - u - v) * n0 + u * n1 + v * n2;

  return is_zero(N) ? Ng : N;
 }
@@ -137,8 +137,8 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
  const float3 p2 = kernel_data_fetch(tri_verts, tri_vindex.w + 2);

  /* compute derivatives of P w.r.t. uv */
-  *dPdu = (p0 - p2);
-  *dPdv = (p1 - p2);
+  *dPdu = (p1 - p0);
+  *dPdv = (p2 - p0);
 }

 /* Reading attributes on various triangle elements */
@@ -167,12 +167,12 @@ ccl_device float triangle_attribute_float(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
    if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif

-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
  }
  else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -217,12 +217,12 @@ ccl_device float2 triangle_attribute_float2(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
    if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif

-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
  }
  else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -267,12 +267,12 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
    if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif

-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
  }
  else {
 #ifdef __RAY_DIFFERENTIALS__
@@ -328,12 +328,12 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,

 #ifdef __RAY_DIFFERENTIALS__
    if (dx)
-      *dx = sd->du.dx * f0 + sd->dv.dx * f1 - (sd->du.dx + sd->dv.dx) * f2;
+      *dx = sd->du.dx * f1 + sd->dv.dx * f2 - (sd->du.dx + sd->dv.dx) * f0;
    if (dy)
-      *dy = sd->du.dy * f0 + sd->dv.dy * f1 - (sd->du.dy + sd->dv.dy) * f2;
+      *dy = sd->du.dy * f1 + sd->dv.dy * f2 - (sd->du.dy + sd->dv.dy) * f0;
 #endif

-    return sd->u * f0 + sd->v * f1 + (1.0f - sd->u - sd->v) * f2;
+    return sd->u * f1 + sd->v * f2 + (1.0f - sd->u - sd->v) * f0;
  }
  else {
 #ifdef __RAY_DIFFERENTIALS__
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -145,9 +145,9 @@ ccl_device_inline float3 triangle_point_from_uv(KernelGlobals kg,
  const packed_float3 tri_a = kernel_data_fetch(tri_verts, tri_vindex + 0),
                      tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
                      tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
-  float w = 1.0f - u - v;

-  float3 P = u * tri_a + v * tri_b + w * tri_c;
+  /* This appears to give slightly better precision than interpolating with w = (1 - u - v). */
+  float3 P = tri_a + u * (tri_b - tri_a) + v * (tri_c - tri_a);

  if (!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
    const Transform tfm = object_get_transform(kg, sd);
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -155,6 +155,11 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
                                 1.0f - u);
  }

+  /* Convert from Blender to Cycles/Embree/OptiX barycentric convention. */
+  const float tmp = u;
+  u = v;
+  v = 1.0f - tmp - v;
+
  /* Position and normal on triangle. */
  const int object = kernel_data.bake.object_index;
  float3 P, Ng;
--- a/intern/cycles/kernel/integrator/intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/intersect_shadow.h
@@ -51,7 +51,7 @@ ccl_device_forceinline int integrate_shadow_max_transparent_hits(KernelGlobals k
 }

 #ifdef __TRANSPARENT_SHADOWS__
-#  if defined(__KERNEL_CPU__)
+#  ifndef __KERNEL_GPU__
 ccl_device int shadow_intersections_compare(const void *a, const void *b)
 {
  const Intersection *isect_a = (const Intersection *)a;
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -38,8 +38,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,

 #ifdef __VOLUME_RECORD_ALL__
  Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
-  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
+  uint num_hits = scene_intersect_volume(kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
  if (num_hits > 0) {
    Intersection *isect = hits;

@@ -108,8 +107,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s

 #ifdef __VOLUME_RECORD_ALL__
  Intersection hits[2 * MAX_VOLUME_STACK_SIZE + 1];
-  uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
+  uint num_hits = scene_intersect_volume(kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
  if (num_hits > 0) {
    int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
    Intersection *isect = hits;
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -186,7 +186,7 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
    triangle_vertices_and_normals(kg, sd_vtx->prim, verts, normals);

    /* Compute refined position (same code as in triangle_point_from_uv). */
-    sd_vtx->P = isect->u * verts[0] + isect->v * verts[1] + (1.f - isect->u - isect->v) * verts[2];
+    sd_vtx->P = (1.f - isect->u - isect->v) * verts[0] + isect->u * verts[1] + isect->v * verts[2];
    if (!(sd_vtx->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
      const Transform tfm = object_get_transform(kg, sd_vtx);
      sd_vtx->P = transform_point(&tfm, sd_vtx->P);
@@ -213,8 +213,8 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
  }

  /* Tangent space (position derivatives) WRT barycentric (u, v). */
-  float3 dp_du = verts[0] - verts[2];
-  float3 dp_dv = verts[1] - verts[2];
+  float3 dp_du = verts[1] - verts[0];
+  float3 dp_dv = verts[2] - verts[0];

  /* Geometric normal. */
  vtx->ng = normalize(cross(dp_du, dp_dv));
@@ -223,16 +223,16 @@ ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,

  /* Shading normals: Interpolate normals between vertices. */
  float n_len;
-  vtx->n = normalize_len(normals[0] * sd_vtx->u + normals[1] * sd_vtx->v +
-                             normals[2] * (1.0f - sd_vtx->u - sd_vtx->v),
+  vtx->n = normalize_len(normals[0] * (1.0f - sd_vtx->u - sd_vtx->v) + normals[1] * sd_vtx->u +
+                             normals[2] * sd_vtx->v,
                         &n_len);

  /* Shading normal derivatives WRT barycentric (u, v)
-   * we calculate the derivative of n = |u*n0 + v*n1 + (1-u-v)*n2| using:
+   * we calculate the derivative of n = |(1-u-v)*n0 + u*n1 + v*n2| using:
   * d/du [f(u)/|f(u)|] = [d/du f(u)]/|f(u)| - f(u)/|f(u)|^3 <f(u), d/du f(u)>. */
  const float inv_n_len = 1.f / n_len;
-  float3 dn_du = inv_n_len * (normals[0] - normals[2]);
-  float3 dn_dv = inv_n_len * (normals[1] - normals[2]);
+  float3 dn_du = inv_n_len * (normals[1] - normals[0]);
+  float3 dn_dv = inv_n_len * (normals[2] - normals[0]);
  dn_du -= vtx->n * dot(vtx->n, dn_du);
  dn_dv -= vtx->n * dot(vtx->n, dn_dv);

--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -13,7 +13,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void path_state_init_queues(IntegratorState state)
 {
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
  INTEGRATOR_STATE_WRITE(&state->shadow, shadow_path, queued_kernel) = 0;
  INTEGRATOR_STATE_WRITE(&state->ao, shadow_path, queued_kernel) = 0;
 #endif
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -140,7 +140,7 @@ typedef struct IntegratorStateGPU {
 * happen from a kernel which operates on a "main" path. Attempt to use shadow catcher accessors
 * from a kernel which operates on a shadow catcher state will cause bad memory access. */

-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__

 /* Scalar access on CPU. */

@@ -159,7 +159,7 @@ typedef const IntegratorShadowStateCPU *ccl_restrict ConstIntegratorShadowState;
 #  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
    ((state)->nested_struct[array_index].member)

-#else /* __KERNEL_CPU__ */
+#else /* !__KERNEL_GPU__ */

 /* Array access on GPU with Structure-of-Arrays. */

@@ -180,6 +180,6 @@ typedef int ConstIntegratorShadowState;
 #  define INTEGRATOR_STATE_ARRAY_WRITE(state, nested_struct, array_index, member) \
    INTEGRATOR_STATE_ARRAY(state, nested_struct, array_index, member)

-#endif /* __KERNEL_CPU__ */
+#endif /* !__KERNEL_GPU__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -338,7 +338,7 @@ ccl_device_inline IntegratorState integrator_state_shadow_catcher_split(KernelGl
  return to_state;
 }

-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 ccl_device_inline int integrator_state_bounce(ConstIntegratorState state, const int)
 {
  return INTEGRATOR_STATE(state, path, bounce);
--- a/intern/cycles/kernel/integrator/subsurface_disk.h
+++ b/intern/cycles/kernel/integrator/subsurface_disk.h
@@ -126,17 +126,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
    if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
      /* Transform normal to world space. */
      Transform itfm;
-      Transform tfm = object_fetch_transform_motion_test(kg, object, time, &itfm);
+      object_fetch_transform_motion_test(kg, object, time, &itfm);
      hit_Ng = normalize(transform_direction_transposed(&itfm, hit_Ng));
-
-      /* Transform t to world space, except for OptiX and MetalRT where it already is. */
-#ifdef __KERNEL_GPU_RAYTRACING__
-      (void)tfm;
-#else
-      float3 D = transform_direction(&itfm, ray.D);
-      D = normalize(D) * ss_isect.hits[hit].t;
-      ss_isect.hits[hit].t = len(transform_direction(&tfm, D));
-#endif
    }

    /* Quickly retrieve P and Ng without setting up ShaderData. */
--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -205,12 +205,6 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
  ray.self.light_object = OBJECT_NONE;
  ray.self.light_prim = PRIM_NONE;

-#ifndef __KERNEL_GPU_RAYTRACING__
-  /* Compute or fetch object transforms. */
-  Transform ob_itfm ccl_optional_struct_init;
-  Transform ob_tfm = object_fetch_transform_motion_test(kg, object, time, &ob_itfm);
-#endif
-
  /* Convert subsurface to volume coefficients.
   * The single-scattering albedo is named alpha to avoid confusion with the surface albedo. */
  const float3 albedo = INTEGRATOR_STATE(state, subsurface, albedo);
@@ -383,15 +377,7 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
    hit = (ss_isect.num_hits > 0);

    if (hit) {
-#ifdef __KERNEL_GPU_RAYTRACING__
-      /* t is always in world space with OptiX and MetalRT. */
      ray.tmax = ss_isect.hits[0].t;
-#else
-      /* Compute world space distance to surface hit. */
-      float3 D = transform_direction(&ob_itfm, ray.D);
-      D = normalize(D) * ss_isect.hits[0].t;
-      ray.tmax = len(transform_direction(&ob_tfm, D));
-#endif
    }

    if (bounce == 0) {
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -137,8 +137,9 @@ ccl_device_inline float3 shadow_ray_smooth_surface_offset(
    triangle_vertices_and_normals(kg, sd->prim, V, N);
  }

-  const float u = sd->u, v = sd->v;
-  const float w = 1 - u - v;
+  const float u = 1.0f - sd->u - sd->v;
+  const float v = sd->u;
+  const float w = sd->v;
  float3 P = V[0] * u + V[1] * v + V[2] * w; /* Local space */
  float3 n = N[0] * u + N[1] * v + N[2] * w; /* We get away without normalization */

--- a/intern/cycles/kernel/osl/shaders/node_geometry.osl
+++ b/intern/cycles/kernel/osl/shaders/node_geometry.osl
@@ -20,7 +20,7 @@ shader node_geometry(normal NormalIn = N,
  Normal = NormalIn;
  TrueNormal = Ng;
  Incoming = I;
-  Parametric = point(u, v, 0.0);
+  Parametric = point(1.0 - u - v, u, 0.0);
  Backfacing = backfacing();

  if (bump_offset == "dx") {
--- a/intern/cycles/kernel/svm/geometry.h
+++ b/intern/cycles/kernel/svm/geometry.h
@@ -34,7 +34,7 @@ ccl_device_noinline void svm_node_geometry(KernelGlobals kg,
      data = sd->Ng;
      break;
    case NODE_GEOM_uv:
-      data = make_float3(sd->u, sd->v, 0.0f);
+      data = make_float3(1.0f - sd->u - sd->v, sd->u, 0.0f);
      break;
    default:
      data = make_float3(0.0f, 0.0f, 0.0f);
@@ -57,7 +57,7 @@ ccl_device_noinline void svm_node_geometry_bump_dx(KernelGlobals kg,
      data = sd->P + sd->dP.dx;
      break;
    case NODE_GEOM_uv:
-      data = make_float3(sd->u + sd->du.dx, sd->v + sd->dv.dx, 0.0f);
+      data = make_float3(1.0f - sd->u - sd->du.dx - sd->v - sd->dv.dx, sd->u + sd->du.dx, 0.0f);
      break;
    default:
      svm_node_geometry(kg, sd, stack, type, out_offset);
@@ -84,7 +84,7 @@ ccl_device_noinline void svm_node_geometry_bump_dy(KernelGlobals kg,
      data = sd->P + sd->dP.dy;
      break;
    case NODE_GEOM_uv:
-      data = make_float3(sd->u + sd->du.dy, sd->v + sd->dv.dy, 0.0f);
+      data = make_float3(1.0f - sd->u - sd->du.dy - sd->v - sd->dv.dy, sd->u + sd->du.dy, 0.0f);
      break;
    default:
      svm_node_geometry(kg, sd, stack, type, out_offset);
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -19,10 +19,6 @@

 #include "kernel/svm/types.h"

-#ifndef __KERNEL_GPU__
-#  define __KERNEL_CPU__
-#endif
-
 CCL_NAMESPACE_BEGIN

 /* Constants */
@@ -51,10 +47,10 @@ CCL_NAMESPACE_BEGIN
 #define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024U
 #define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4U

-#ifdef __KERNEL_CPU__
-#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
-#else
+#ifdef __KERNEL_GPU__
 #  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_GPU
+#else
+#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
 #endif

 /* Kernel features */
@@ -83,7 +79,6 @@ CCL_NAMESPACE_BEGIN
 #define __LAMP_MIS__
 #define __CAMERA_MOTION__
 #define __OBJECT_MOTION__
-#define __BAKING__
 #define __PRINCIPLED__
 #define __SUBSURFACE__
 #define __VOLUME__
@@ -92,16 +87,12 @@ CCL_NAMESPACE_BEGIN
 #define __BRANCHED_PATH__

 /* Device specific features */
-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  ifdef WITH_OSL
 #    define __OSL__
 #  endif
 #  define __VOLUME_RECORD_ALL__
-#endif /* __KERNEL_CPU__ */
-
-#ifdef __KERNEL_GPU_RAYTRACING__
-#  undef __BAKING__
-#endif /* __KERNEL_GPU_RAYTRACING__ */
+#endif /* !__KERNEL_GPU__ */

 /* MNEE currently causes "Compute function exceeds available temporary registers"
 * on Metal, disabled for now. */
@@ -129,9 +120,6 @@ CCL_NAMESPACE_BEGIN
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_SUBSURFACE)
 #    undef __SUBSURFACE__
 #  endif
-#  if !(__KERNEL_FEATURES & KERNEL_FEATURE_BAKING)
-#    undef __BAKING__
-#  endif
 #  if !(__KERNEL_FEATURES & KERNEL_FEATURE_PATCH_EVALUATION)
 #    undef __PATCH_EVAL__
 #  endif
@@ -730,7 +718,7 @@ typedef struct ccl_align(16) ShaderClosure
 {
  SHADER_CLOSURE_BASE;

-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
  float pad[2];
 #endif
  float data[10];
@@ -1168,7 +1156,7 @@ typedef struct KernelData {
  uint max_shaders;
  uint volume_stack_size;

-  /* Always dynamic data mambers. */
+  /* Always dynamic data members. */
  KernelCamera cam;
  KernelBake bake;
  KernelTables tables;
@@ -1548,15 +1536,15 @@ enum KernelFeatureFlag : uint32_t {
 /* Must be constexpr on the CPU to avoid compile errors because the state types
 * are different depending on the main, shadow or null path. For GPU we don't have
 * C++17 everywhere so can't use it. */
-#ifdef __KERNEL_CPU__
+#ifdef __KERNEL_GPU__
+#  define IF_KERNEL_FEATURE(feature) if ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
+#  define IF_KERNEL_NODES_FEATURE(feature) \
+    if ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
+#else
 #  define IF_KERNEL_FEATURE(feature) \
    if constexpr ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
 #  define IF_KERNEL_NODES_FEATURE(feature) \
    if constexpr ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
-#else
-#  define IF_KERNEL_FEATURE(feature) if ((node_feature_mask & (KERNEL_FEATURE_##feature)) != 0U)
-#  define IF_KERNEL_NODES_FEATURE(feature) \
-    if ((node_feature_mask & (KERNEL_FEATURE_NODE_##feature)) != 0U)
 #endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/util/profiling.h
+++ b/intern/cycles/kernel/util/profiling.h
@@ -3,13 +3,13 @@

 #pragma once

-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  include "util/profiling.h"
 #endif

 CCL_NAMESPACE_BEGIN

-#ifdef __KERNEL_CPU__
+#ifndef __KERNEL_GPU__
 #  define PROFILING_INIT(kg, event) \
    ProfilingHelper profiling_helper((ProfilingState *)&kg->profiler, event)
 #  define PROFILING_EVENT(event) profiling_helper.set_event(event)
@@ -22,6 +22,6 @@ CCL_NAMESPACE_BEGIN
 #  define PROFILING_EVENT(event)
 #  define PROFILING_INIT_FOR_SHADER(kg, event)
 #  define PROFILING_SHADER(object, shader)
-#endif /* __KERNEL_CPU__ */
+#endif /* !__KERNEL_GPU__ */

 CCL_NAMESPACE_END
--- a/intern/cycles/scene/mesh_displace.cpp
+++ b/intern/cycles/scene/mesh_displace.cpp
@@ -73,16 +73,16 @@ static int fill_shader_input(const Scene *scene,

      switch (j) {
        case 0:
-          u = 1.0f;
+          u = 0.0f;
          v = 0.0f;
          break;
        case 1:
-          u = 0.0f;
-          v = 1.0f;
+          u = 1.0f;
+          v = 0.0f;
          break;
        default:
          u = 0.0f;
-          v = 0.0f;
+          v = 1.0f;
          break;
      }

--- a/intern/cycles/session/buffers.cpp
+++ b/intern/cycles/session/buffers.cpp
@@ -209,7 +209,7 @@ const BufferPass *BufferParams::get_actual_display_pass(const BufferPass *pass)
    return nullptr;
  }

-  if (pass->type == PASS_COMBINED) {
+  if (pass->type == PASS_COMBINED && pass->lightgroup.empty()) {
    const BufferPass *shadow_catcher_matte_pass = find_pass(PASS_SHADOW_CATCHER_MATTE, pass->mode);
    if (shadow_catcher_matte_pass) {
      pass = shadow_catcher_matte_pass;
--- a/intern/cycles/test/util_avxf_avx2_test.cpp
+++ b/intern/cycles/test/util_avxf_avx2_test.cpp
@@ -2,7 +2,6 @@
 * Copyright 2011-2022 Blender Foundation */

 #define __KERNEL_AVX2__
-#define __KERNEL_CPU__

 #define TEST_CATEGORY_NAME util_avx2

--- a/intern/cycles/test/util_avxf_avx_test.cpp
+++ b/intern/cycles/test/util_avxf_avx_test.cpp
@@ -2,7 +2,6 @@
 * Copyright 2011-2022 Blender Foundation */

 #define __KERNEL_AVX__
-#define __KERNEL_CPU__

 #define TEST_CATEGORY_NAME util_avx

--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -63,6 +63,7 @@ set(SRC_HEADERS
  math_float2.h
  math_float3.h
  math_float4.h
+  math_float8.h
  math_int2.h
  math_int3.h
  math_int4.h
@@ -128,8 +129,6 @@ set(SRC_HEADERS
  types_uint4.h
  types_uint4_impl.h
  types_ushort4.h
-  types_vector3.h
-  types_vector3_impl.h
  unique_ptr.h
  vector.h
  version.h
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -81,7 +81,7 @@
 /* macros */

 /* hints for branch prediction, only use in code that runs a _lot_ */
-#if defined(__GNUC__) && defined(__KERNEL_CPU__)
+#if defined(__GNUC__) && !defined(__KERNEL_GPU__)
 #  define LIKELY(x) __builtin_expect(!!(x), 1)
 #  define UNLIKELY(x) __builtin_expect(!!(x), 0)
 #else
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -511,6 +511,11 @@ ccl_device_inline float4 float3_to_float4(const float3 a)
  return make_float4(a.x, a.y, a.z, 1.0f);
 }

+ccl_device_inline float4 float3_to_float4(const float3 a, const float w)
+{
+  return make_float4(a.x, a.y, a.z, w);
+}
+
 ccl_device_inline float inverse_lerp(float a, float b, float x)
 {
  return (x - a) / (b - a);
@@ -535,6 +540,7 @@ CCL_NAMESPACE_END
 #include "util/math_float2.h"
 #include "util/math_float3.h"
 #include "util/math_float4.h"
+#include "util/math_float8.h"

 #include "util/rect.h"

@@ -947,7 +953,11 @@ ccl_device_inline uint prev_power_of_two(uint x)
 ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
 {
  /* Use a native instruction if it exists. */
-#if defined(__aarch64__) || defined(_M_ARM64)
+#if defined(__KERNEL_CUDA__)
+  return __brev(x);
+#elif defined(__KERNEL_METAL__)
+  return reverse_bits(x);
+#elif defined(__aarch64__) || defined(_M_ARM64)
  /* Assume the rbit is always available on 64bit ARM architecture. */
  __asm__("rbit %w0, %w1" : "=r"(x) : "r"(x));
  return x;
@@ -956,10 +966,6 @@ ccl_device_inline uint32_t reverse_integer_bits(uint32_t x)
   * This 32-bit Thumb instruction is available in ARMv6T2 and above. */
  __asm__("rbit %0, %1" : "=r"(x) : "r"(x));
  return x;
-#elif defined(__KERNEL_CUDA__)
-  return __brev(x);
-#elif defined(__KERNEL_METAL__)
-  return reverse_bits(x);
 #elif __has_builtin(__builtin_bitreverse32)
  return __builtin_bitreverse32(x);
 #else
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -420,7 +420,7 @@ ccl_device_inline float fast_expf(float x)
  return fast_exp2f(x / M_LN2_F);
 }

-#if defined(__KERNEL_CPU__) && !defined(_MSC_VER)
+#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
 /* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
 * T78047 and T78869 for details. Just disable for now, it only makes
 * a small difference in denoising performance. */
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -147,8 +147,11 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)

 ccl_device_inline float3 operator/(const float3 &a, const float f)
 {
-  float invf = 1.0f / f;
-  return a * invf;
+#  if defined(__KERNEL_SSE__)
+  return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
+#  else
+  return make_float3(a.x / f, a.y / f, a.z / f);
+#  endif
 }

 ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
@@ -284,8 +287,12 @@ ccl_device_inline float dot_xy(const float3 &a, const float3 &b)

 ccl_device_inline float3 cross(const float3 &a, const float3 &b)
 {
-  float3 r = make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
-  return r;
+#  ifdef __KERNEL_SSE__
+  return float3(shuffle<1, 2, 0, 3>(
+      msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+#  else
+  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+#  endif
 }

 ccl_device_inline float3 normalize(const float3 &a)
--- a/intern/cycles/util/math_float8.h
+++ b/intern/cycles/util/math_float8.h
@@ -0,0 +1,419 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2022 Blender Foundation */
+
+#ifndef __UTIL_MATH_FLOAT8_H__
+#define __UTIL_MATH_FLOAT8_H__
+
+#ifndef __UTIL_MATH_H__
+#  error "Do not include this file directly, include util/math.h instead."
+#endif
+
+CCL_NAMESPACE_BEGIN
+
+/*******************************************************************************
+ * Declaration.
+ */
+
+ccl_device_inline float8_t operator+(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator+(const float8_t a, const float f);
+ccl_device_inline float8_t operator+(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator-(const float8_t a);
+ccl_device_inline float8_t operator-(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator-(const float8_t a, const float f);
+ccl_device_inline float8_t operator-(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator*(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator*(const float8_t a, const float f);
+ccl_device_inline float8_t operator*(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator/(const float8_t a, const float8_t b);
+ccl_device_inline float8_t operator/(const float8_t a, float f);
+ccl_device_inline float8_t operator/(const float f, const float8_t a);
+
+ccl_device_inline float8_t operator+=(float8_t a, const float8_t b);
+
+ccl_device_inline float8_t operator*=(float8_t a, const float8_t b);
+ccl_device_inline float8_t operator*=(float8_t a, float f);
+
+ccl_device_inline float8_t operator/=(float8_t a, float f);
+
+ccl_device_inline bool operator==(const float8_t a, const float8_t b);
+
+ccl_device_inline float8_t rcp(const float8_t a);
+ccl_device_inline float8_t sqrt(const float8_t a);
+ccl_device_inline float8_t sqr(const float8_t a);
+ccl_device_inline bool is_zero(const float8_t a);
+ccl_device_inline float average(const float8_t a);
+ccl_device_inline float8_t min(const float8_t a, const float8_t b);
+ccl_device_inline float8_t max(const float8_t a, const float8_t b);
+ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx);
+ccl_device_inline float8_t fabs(const float8_t a);
+ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t);
+ccl_device_inline float8_t saturate(const float8_t a);
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float b);
+ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b);
+
+ccl_device_inline float reduce_min(const float8_t a);
+ccl_device_inline float reduce_max(const float8_t a);
+ccl_device_inline float reduce_add(const float8_t a);
+
+ccl_device_inline bool isequal(const float8_t a, const float8_t b);
+
+/*******************************************************************************
+ * Definition.
+ */
+
+ccl_device_inline float8_t zero_float8_t()
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_setzero_ps());
+#else
+  return make_float8_t(0.0f);
+#endif
+}
+
+ccl_device_inline float8_t one_float8_t()
+{
+  return make_float8_t(1.0f);
+}
+
+ccl_device_inline float8_t operator+(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_add_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a + b.a, a.b + b.b, a.c + b.c, a.d + b.d, a.e + b.e, a.f + b.f, a.g + b.g, a.h + b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator+(const float8_t a, const float f)
+{
+  return a + make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator+(const float f, const float8_t a)
+{
+  return make_float8_t(f) + a;
+}
+
+ccl_device_inline float8_t operator-(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  return float8_t(_mm256_xor_ps(a.m256, mask));
+#else
+  return make_float8_t(-a.a, -a.b, -a.c, -a.d, -a.e, -a.f, -a.g, -a.h);
+#endif
+}
+
+ccl_device_inline float8_t operator-(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_sub_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a - b.a, a.b - b.b, a.c - b.c, a.d - b.d, a.e - b.e, a.f - b.f, a.g - b.g, a.h - b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator-(const float8_t a, const float f)
+{
+  return a - make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator-(const float f, const float8_t a)
+{
+  return make_float8_t(f) - a;
+}
+
+ccl_device_inline float8_t operator*(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_mul_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a * b.a, a.b * b.b, a.c * b.c, a.d * b.d, a.e * b.e, a.f * b.f, a.g * b.g, a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator*(const float8_t a, const float f)
+{
+  return a * make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator*(const float f, const float8_t a)
+{
+  return make_float8_t(f) * a;
+}
+
+ccl_device_inline float8_t operator/(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_div_ps(a.m256, b.m256));
+#else
+  return make_float8_t(
+      a.a / b.a, a.b / b.b, a.c / b.c, a.d / b.d, a.e / b.e, a.f / b.f, a.g / b.g, a.h / b.h);
+#endif
+}
+
+ccl_device_inline float8_t operator/(const float8_t a, const float f)
+{
+  return a / make_float8_t(f);
+}
+
+ccl_device_inline float8_t operator/(const float f, const float8_t a)
+{
+  return make_float8_t(f) / a;
+}
+
+ccl_device_inline float8_t operator+=(float8_t a, const float8_t b)
+{
+  return a = a + b;
+}
+
+ccl_device_inline float8_t operator-=(float8_t a, const float8_t b)
+{
+  return a = a - b;
+}
+
+ccl_device_inline float8_t operator*=(float8_t a, const float8_t b)
+{
+  return a = a * b;
+}
+
+ccl_device_inline float8_t operator*=(float8_t a, float f)
+{
+  return a = a * f;
+}
+
+ccl_device_inline float8_t operator/=(float8_t a, float f)
+{
+  return a = a / f;
+}
+
+/* Lane-wise equality: true only when all eight components compare equal as floats. */
+ccl_device_inline bool operator==(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  /* Ordered float compare so the result matches the scalar path (-0 == +0, NaN != NaN). */
+  return _mm256_movemask_ps(_mm256_cmp_ps(a.m256, b.m256, _CMP_EQ_OQ)) == 0b11111111;
+#else
+  return (a.a == b.a && a.b == b.b && a.c == b.c && a.d == b.d && a.e == b.e && a.f == b.f &&
+          a.g == b.g && a.h == b.h);
+#endif
+}
+
+/* Component-wise reciprocal 1 / a, computed with a full-precision division on every path. */
+ccl_device_inline float8_t rcp(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  /* Use a real division: _mm256_rcp_ps is only an approximation (relative error up to
+   * 1.5 * 2^-12), which would make AVX2 builds disagree with the scalar path below. */
+  return float8_t(_mm256_div_ps(_mm256_set1_ps(1.0f), a.m256));
+#else
+  return make_float8_t(
+      1.0f / a.a, 1.0f / a.b,
+      1.0f / a.c, 1.0f / a.d,
+      1.0f / a.e, 1.0f / a.f,
+      1.0f / a.g, 1.0f / a.h);
+#endif
+}
+
+ccl_device_inline float8_t sqrt(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_sqrt_ps(a.m256));
+#else
+  return make_float8_t(sqrtf(a.a),
+                       sqrtf(a.b),
+                       sqrtf(a.c),
+                       sqrtf(a.d),
+                       sqrtf(a.e),
+                       sqrtf(a.f),
+                       sqrtf(a.g),
+                       sqrtf(a.h));
+#endif
+}
+
+ccl_device_inline float8_t sqr(const float8_t a)
+{
+  return a * a;
+}
+
+ccl_device_inline bool is_zero(const float8_t a)
+{
+  return a == make_float8_t(0.0f);
+}
+
+ccl_device_inline float average(const float8_t a)
+{
+  return reduce_add(a) / 8.0f;
+}
+
+ccl_device_inline float8_t min(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_min_ps(a.m256, b.m256));
+#else
+  return make_float8_t(min(a.a, b.a),
+                       min(a.b, b.b),
+                       min(a.c, b.c),
+                       min(a.d, b.d),
+                       min(a.e, b.e),
+                       min(a.f, b.f),
+                       min(a.g, b.g),
+                       min(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8_t max(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_max_ps(a.m256, b.m256));
+#else
+  return make_float8_t(max(a.a, b.a),
+                       max(a.b, b.b),
+                       max(a.c, b.c),
+                       max(a.d, b.d),
+                       max(a.e, b.e),
+                       max(a.f, b.f),
+                       max(a.g, b.g),
+                       max(a.h, b.h));
+#endif
+}
+
+ccl_device_inline float8_t clamp(const float8_t a, const float8_t mn, const float8_t mx)
+{
+  return min(max(a, mn), mx);
+}
+
+ccl_device_inline float8_t fabs(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  return float8_t(_mm256_and_ps(a.m256, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff))));
+#else
+  return make_float8_t(fabsf(a.a),
+                       fabsf(a.b),
+                       fabsf(a.c),
+                       fabsf(a.d),
+                       fabsf(a.e),
+                       fabsf(a.f),
+                       fabsf(a.g),
+                       fabsf(a.h));
+#endif
+}
+
+ccl_device_inline float8_t mix(const float8_t a, const float8_t b, float t)
+{
+  return a + t * (b - a);
+}
+
+ccl_device_inline float8_t saturate(const float8_t a)
+{
+  return clamp(a, make_float8_t(0.0f), make_float8_t(1.0f));
+}
+
+ccl_device_inline float8_t exp(float8_t v)
+{
+  return make_float8_t(
+      expf(v.a), expf(v.b), expf(v.c), expf(v.d), expf(v.e), expf(v.f), expf(v.g), expf(v.h));
+}
+
+ccl_device_inline float8_t log(float8_t v)
+{
+  return make_float8_t(
+      logf(v.a), logf(v.b), logf(v.c), logf(v.d), logf(v.e), logf(v.f), logf(v.g), logf(v.h));
+}
+
+ccl_device_inline float dot(const float8_t a, const float8_t b)
+{
+#ifdef __KERNEL_AVX2__
+  float8_t t(_mm256_dp_ps(a.m256, b.m256, 0xFF));
+  return t[0] + t[4];
+#else
+  return (a.a * b.a) + (a.b * b.b) + (a.c * b.c) + (a.d * b.d) + (a.e * b.e) + (a.f * b.f) +
+         (a.g * b.g) + (a.h * b.h);
+#endif
+}
+
+ccl_device_inline float8_t pow(float8_t v, float e)
+{
+  return make_float8_t(powf(v.a, e),
+                       powf(v.b, e),
+                       powf(v.c, e),
+                       powf(v.d, e),
+                       powf(v.e, e),
+                       powf(v.f, e),
+                       powf(v.g, e),
+                       powf(v.h, e));
+}
+
+ccl_device_inline float reduce_min(const float8_t a)
+{
+  return min(min(min(a.a, a.b), min(a.c, a.d)), min(min(a.e, a.f), min(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_max(const float8_t a)
+{
+  return max(max(max(a.a, a.b), max(a.c, a.d)), max(max(a.e, a.f), max(a.g, a.h)));
+}
+
+ccl_device_inline float reduce_add(const float8_t a)
+{
+#ifdef __KERNEL_AVX2__
+  float8_t b(_mm256_hadd_ps(a.m256, a.m256));
+  float8_t h(_mm256_hadd_ps(b.m256, b.m256));
+  return h[0] + h[4];
+#else
+  return a.a + a.b + a.c + a.d + a.e + a.f + a.g + a.h;
+#endif
+}
+
+ccl_device_inline bool isequal(const float8_t a, const float8_t b)
+{
+  return a == b;
+}
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float b)
+{
+  return (b != 0.0f) ? a / b : make_float8_t(0.0f);
+}
+
+ccl_device_inline float8_t safe_divide(const float8_t a, const float8_t b)
+{
+  return make_float8_t((b.a != 0.0f) ? a.a / b.a : 0.0f,
+                       (b.b != 0.0f) ? a.b / b.b : 0.0f,
+                       (b.c != 0.0f) ? a.c / b.c : 0.0f,
+                       (b.d != 0.0f) ? a.d / b.d : 0.0f,
+                       (b.e != 0.0f) ? a.e / b.e : 0.0f,
+                       (b.f != 0.0f) ? a.f / b.f : 0.0f,
+                       (b.g != 0.0f) ? a.g / b.g : 0.0f,
+                       (b.h != 0.0f) ? a.h / b.h : 0.0f);
+}
+
+ccl_device_inline float8_t ensure_finite(float8_t v)
+{
+  v.a = ensure_finite(v.a);
+  v.b = ensure_finite(v.b);
+  v.c = ensure_finite(v.c);
+  v.d = ensure_finite(v.d);
+  v.e = ensure_finite(v.e);
+  v.f = ensure_finite(v.f);
+  v.g = ensure_finite(v.g);
+  v.h = ensure_finite(v.h);
+
+  return v;
+}
+
+ccl_device_inline bool isfinite_safe(float8_t v)
+{
+  return isfinite_safe(v.a) && isfinite_safe(v.b) && isfinite_safe(v.c) && isfinite_safe(v.d) &&
+         isfinite_safe(v.e) && isfinite_safe(v.f) && isfinite_safe(v.g) && isfinite_safe(v.h);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_MATH_FLOAT8_H__ */
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -105,10 +105,10 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
  return false;
 }

-ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
-                                                   float3 ray_dir,
-                                                   float ray_tmin,
-                                                   float ray_tmax,
+ccl_device_forceinline bool ray_triangle_intersect(const float3 ray_P,
+                                                   const float3 ray_D,
+                                                   const float ray_tmin,
+                                                   const float ray_tmax,
                                                   const float3 tri_a,
                                                   const float3 tri_b,
                                                   const float3 tri_c,
@@ -116,14 +116,13 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                   ccl_private float *isect_v,
                                                   ccl_private float *isect_t)
 {
-#define dot3(a, b) dot(a, b)
-  const float3 P = ray_P;
-  const float3 dir = ray_dir;
+  /* This implementation matches the Plücker coordinates triangle intersection
+   * in Embree. */

  /* Calculate vertices relative to ray origin. */
-  const float3 v0 = tri_c - P;
-  const float3 v1 = tri_a - P;
-  const float3 v2 = tri_b - P;
+  const float3 v0 = tri_a - ray_P;
+  const float3 v1 = tri_b - ray_P;
+  const float3 v2 = tri_c - ray_P;

  /* Calculate triangle edges. */
  const float3 e0 = v2 - v0;
@@ -131,40 +130,40 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
  const float3 e2 = v1 - v2;

  /* Perform edge tests. */
-  const float U = dot(cross(v2 + v0, e0), ray_dir);
-  const float V = dot(cross(v0 + v1, e1), ray_dir);
-  const float W = dot(cross(v1 + v2, e2), ray_dir);
+  const float U = dot(cross(e0, v2 + v0), ray_D);
+  const float V = dot(cross(e1, v0 + v1), ray_D);
+  const float W = dot(cross(e2, v1 + v2), ray_D);

+  const float UVW = U + V + W;
+  const float eps = FLT_EPSILON * fabsf(UVW);
  const float minUVW = min(U, min(V, W));
  const float maxUVW = max(U, max(V, W));

-  if (minUVW < 0.0f && maxUVW > 0.0f) {
+  if (!(minUVW >= -eps || maxUVW <= eps)) {
    return false;
  }

  /* Calculate geometry normal and denominator. */
  const float3 Ng1 = cross(e1, e0);
-  // const Vec3vfM Ng1 = stable_triangle_normal(e2,e1,e0);
  const float3 Ng = Ng1 + Ng1;
-  const float den = dot3(Ng, dir);
+  const float den = dot(Ng, ray_D);
  /* Avoid division by 0. */
  if (UNLIKELY(den == 0.0f)) {
    return false;
  }

  /* Perform depth test. */
-  const float T = dot3(v0, Ng);
+  const float T = dot(v0, Ng);
  const float t = T / den;
  if (!(t >= ray_tmin && t <= ray_tmax)) {
    return false;
  }

-  *isect_u = U / den;
-  *isect_v = V / den;
+  const float rcp_UVW = (fabsf(UVW) < 1e-18f) ? 0.0f : 1.0f / UVW;
+  *isect_u = min(U * rcp_UVW, 1.0f);
+  *isect_v = min(V * rcp_UVW, 1.0f);
  *isect_t = t;
  return true;
-
-#undef dot3
 }

 /* Tests for an intersection between a ray and a quad defined by
--- a/intern/cycles/util/transform.cpp
+++ b/intern/cycles/util/transform.cpp
@@ -99,15 +99,7 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
  memcpy(M, &tfm, sizeof(M));

  if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-    /* matrix is degenerate (e.g. 0 scale on some axis), ideally we should
-     * never be in this situation, but try to invert it anyway with tweak */
-    M[0][0] += 1e-8f;
-    M[1][1] += 1e-8f;
-    M[2][2] += 1e-8f;
-
-    if (UNLIKELY(!transform_matrix4_gj_inverse(R, M))) {
-      return projection_identity();
-    }
+    return projection_identity();
  }

  memcpy(&tfmR, R, sizeof(R));
@@ -115,16 +107,9 @@ ProjectionTransform projection_inverse(const ProjectionTransform &tfm)
  return tfmR;
 }

-Transform transform_inverse(const Transform &tfm)
-{
-  ProjectionTransform projection(tfm);
-  return projection_to_transform(projection_inverse(projection));
-}
-
 Transform transform_transposed_inverse(const Transform &tfm)
 {
-  ProjectionTransform projection(tfm);
-  ProjectionTransform iprojection = projection_inverse(projection);
+  ProjectionTransform iprojection(transform_inverse(tfm));
  return projection_to_transform(projection_transpose(iprojection));
 }

--- a/intern/cycles/util/transform.h
+++ b/intern/cycles/util/transform.h
@@ -63,10 +63,10 @@ ccl_device_inline float3 transform_point(ccl_private const Transform *t, const f

  _MM_TRANSPOSE4_PS(x, y, z, w);

-  ssef tmp = shuffle<0>(aa) * x;
-  tmp = madd(shuffle<1>(aa), y, tmp);
+  ssef tmp = w;
  tmp = madd(shuffle<2>(aa), z, tmp);
-  tmp += w;
+  tmp = madd(shuffle<1>(aa), y, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);

  return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@@ -93,9 +93,9 @@ ccl_device_inline float3 transform_direction(ccl_private const Transform *t, con

  _MM_TRANSPOSE4_PS(x, y, z, w);

-  ssef tmp = shuffle<0>(aa) * x;
+  ssef tmp = shuffle<2>(aa) * z;
  tmp = madd(shuffle<1>(aa), y, tmp);
-  tmp = madd(shuffle<2>(aa), z, tmp);
+  tmp = madd(shuffle<0>(aa), x, tmp);

  return float3(tmp.m128);
 #elif defined(__KERNEL_METAL__)
@@ -312,7 +312,6 @@ ccl_device_inline void transform_set_column(Transform *t, int column, float3 val
  t->z[column] = value.z;
 }

-Transform transform_inverse(const Transform &a);
 Transform transform_transposed_inverse(const Transform &a);

 ccl_device_inline bool transform_uniform_scale(const Transform &tfm, float &scale)
@@ -392,39 +391,47 @@ ccl_device_inline float4 quat_interpolate(float4 q1, float4 q2, float t)
 #endif /* defined(__KERNEL_GPU_RAYTRACING__) */
 }

-ccl_device_inline Transform transform_quick_inverse(Transform M)
+ccl_device_inline Transform transform_inverse(const Transform tfm)
 {
-  /* possible optimization: can we avoid doing this altogether and construct
-   * the inverse matrix directly from negated translation, transposed rotation,
-   * scale can be inverted but what about shearing? */
-  Transform R;
-  float det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-              M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+  /* This implementation matches the one in Embree exactly, to ensure consistent
+   * results with the ray intersection of instances. */
+  float3 x = make_float3(tfm.x.x, tfm.y.x, tfm.z.x);
+  float3 y = make_float3(tfm.x.y, tfm.y.y, tfm.z.y);
+  float3 z = make_float3(tfm.x.z, tfm.y.z, tfm.z.z);
+  float3 w = make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
+
+  /* Compute determinant. */
+  float det = dot(x, cross(y, z));
+
  if (det == 0.0f) {
-    M.x.x += 1e-8f;
-    M.y.y += 1e-8f;
-    M.z.z += 1e-8f;
-    det = M.x.x * (M.z.z * M.y.y - M.z.y * M.y.z) - M.y.x * (M.z.z * M.x.y - M.z.y * M.x.z) +
-          M.z.x * (M.y.z * M.x.y - M.y.y * M.x.z);
+    /* Matrix is degenerate (e.g. 0 scale on some axis), ideally we should
+     * never be in this situation, but try to invert it anyway with tweak.
+     *
+     * This logic does not match Embree which would just give an invalid
+     * matrix. A better solution would be to remove this and ensure any object
+     * matrix is valid. */
+    x.x += 1e-8f;
+    y.y += 1e-8f;
+    z.z += 1e-8f;
+
+    det = dot(x, cross(y, z));
+    if (det == 0.0f) {
+      det = FLT_MAX;
+    }
  }
-  det = (det != 0.0f) ? 1.0f / det : 0.0f;

-  float3 Rx = det * make_float3(M.z.z * M.y.y - M.z.y * M.y.z,
-                                M.z.y * M.x.z - M.z.z * M.x.y,
-                                M.y.z * M.x.y - M.y.y * M.x.z);
-  float3 Ry = det * make_float3(M.z.x * M.y.z - M.z.z * M.y.x,
-                                M.z.z * M.x.x - M.z.x * M.x.z,
-                                M.y.x * M.x.z - M.y.z * M.x.x);
-  float3 Rz = det * make_float3(M.z.y * M.y.x - M.z.x * M.y.y,
-                                M.z.x * M.x.y - M.z.y * M.x.x,
-                                M.y.y * M.x.x - M.y.x * M.x.y);
-  float3 T = -make_float3(M.x.w, M.y.w, M.z.w);
+  /* Divide adjoint matrix by the determinant to compute inverse of 3x3 matrix. */
+  const float3 inverse_x = cross(y, z) / det;
+  const float3 inverse_y = cross(z, x) / det;
+  const float3 inverse_z = cross(x, y) / det;

-  R.x = make_float4(Rx.x, Rx.y, Rx.z, dot(Rx, T));
-  R.y = make_float4(Ry.x, Ry.y, Ry.z, dot(Ry, T));
-  R.z = make_float4(Rz.x, Rz.y, Rz.z, dot(Rz, T));
+  /* Compute translation and fill transform. */
+  Transform itfm;
+  itfm.x = float3_to_float4(inverse_x, -dot(inverse_x, w));
+  itfm.y = float3_to_float4(inverse_y, -dot(inverse_y, w));
+  itfm.z = float3_to_float4(inverse_z, -dot(inverse_z, w));

-  return R;
+  return itfm;
 }

 ccl_device_inline void transform_compose(ccl_private Transform *tfm,
--- a/intern/cycles/util/types.h
+++ b/intern/cycles/util/types.h
@@ -12,6 +12,7 @@

 #if !defined(__KERNEL_GPU__)
 #  include <stdint.h>
+#  include <stdio.h>
 #endif

 #include "util/defines.h"
@@ -70,6 +71,12 @@ ccl_device_inline bool is_power_of_two(size_t x)

 CCL_NAMESPACE_END

+/* Most GPU APIs provide matching native vector types, so we only need to implement them for
+ * CPU and oneAPI. */
+#if defined(__KERNEL_GPU__) && !defined(__KERNEL_ONEAPI__)
+#  define __KERNEL_NATIVE_VECTOR_TYPES__
+#endif
+
 /* Vectorized types declaration. */
 #include "util/types_uchar2.h"
 #include "util/types_uchar3.h"
@@ -90,8 +97,6 @@ CCL_NAMESPACE_END
 #include "util/types_float4.h"
 #include "util/types_float8.h"

-#include "util/types_vector3.h"
-
 /* Vectorized types implementation. */
 #include "util/types_uchar2_impl.h"
 #include "util/types_uchar3_impl.h"
@@ -110,8 +115,6 @@ CCL_NAMESPACE_END
 #include "util/types_float4_impl.h"
 #include "util/types_float8_impl.h"

-#include "util/types_vector3_impl.h"
-
 /* SSE types. */
 #ifndef __KERNEL_GPU__
 #  include "util/sseb.h"
--- a/intern/cycles/util/types_float2.h
+++ b/intern/cycles/util/types_float2.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_FLOAT2_H__
-#define __UTIL_TYPES_FLOAT2_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,18 +9,18 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct float2 {
  float x, y;

+#  ifndef __KERNEL_GPU__
  __forceinline float operator[](int i) const;
  __forceinline float &operator[](int i);
+#  endif
 };

 ccl_device_inline float2 make_float2(float x, float y);
 ccl_device_inline void print_float2(const char *label, const float2 &a);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT2_H__ */
--- a/intern/cycles/util/types_float2_impl.h
+++ b/intern/cycles/util/types_float2_impl.h
@@ -1,20 +1,16 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_FLOAT2_IMPL_H__
-#define __UTIL_TYPES_FLOAT2_IMPL_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif

-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 __forceinline float float2::operator[](int i) const
 {
  util_assert(i >= 0);
@@ -28,6 +24,7 @@ __forceinline float &float2::operator[](int i)
  util_assert(i < 2);
  return *(&x + i);
 }
+#  endif

 ccl_device_inline float2 make_float2(float x, float y)
 {
@@ -39,8 +36,6 @@ ccl_device_inline void print_float2(const char *label, const float2 &a)
 {
  printf("%s: %.8f %.8f\n", label, (double)a.x, (double)a.y);
 }
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT2_IMPL_H__ */
--- a/intern/cycles/util/types_float3.h
+++ b/intern/cycles/util/types_float3.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_FLOAT3_H__
-#define __UTIL_TYPES_FLOAT3_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,28 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct ccl_try_align(16) float3
 {
-#  ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_GPU__
+  /* Compact structure for GPU. */
+  float x, y, z;
+#  else
+  /* SIMD aligned structure for CPU. */
+#    ifdef __KERNEL_SSE__
  union {
    __m128 m128;
    struct {
      float x, y, z, w;
    };
  };
+#    else
+  float x, y, z, w;
+#    endif
+#  endif

+#  ifdef __KERNEL_SSE__
+  /* Convenient constructors and operators for SIMD, otherwise default is enough. */
  __forceinline float3();
  __forceinline float3(const float3 &a);
  __forceinline explicit float3(const __m128 &a);
@@ -29,18 +39,18 @@ struct ccl_try_align(16) float3
  __forceinline operator __m128 &();

  __forceinline float3 &operator=(const float3 &a);
-#  else  /* __KERNEL_SSE__ */
-  float x, y, z, w;
-#  endif /* __KERNEL_SSE__ */
+#  endif

+#  ifndef __KERNEL_GPU__
  __forceinline float operator[](int i) const;
  __forceinline float &operator[](int i);
+#  endif
 };

 ccl_device_inline float3 make_float3(float f);
 ccl_device_inline float3 make_float3(float x, float y, float z);
 ccl_device_inline void print_float3(const char *label, const float3 &a);
-#endif /* !defined(__KERNEL_GPU__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 /* Smaller float3 for storage. For math operations this must be converted to float3, so that on the
 * CPU SIMD instructions can be used. */
@@ -78,5 +88,3 @@ struct packed_float3 {
 static_assert(sizeof(packed_float3) == 12, "packed_float3 expected to be exactly 12 bytes");

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT3_H__ */
--- a/intern/cycles/util/types_float3_impl.h
+++ b/intern/cycles/util/types_float3_impl.h
@@ -1,20 +1,15 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_FLOAT3_IMPL_H__
-#define __UTIL_TYPES_FLOAT3_IMPL_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif

-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 #  ifdef __KERNEL_SSE__
 __forceinline float3::float3()
 {
@@ -45,6 +40,7 @@ __forceinline float3 &float3::operator=(const float3 &a)
 }
 #  endif /* __KERNEL_SSE__ */

+#  ifndef __KERNEL_GPU__
 __forceinline float float3::operator[](int i) const
 {
  util_assert(i >= 0);
@@ -58,23 +54,32 @@ __forceinline float &float3::operator[](int i)
  util_assert(i < 3);
  return *(&x + i);
 }
+#  endif

 ccl_device_inline float3 make_float3(float f)
 {
-#  ifdef __KERNEL_SSE__
-  float3 a(_mm_set1_ps(f));
+#  ifdef __KERNEL_GPU__
+  float3 a = {f, f, f};
 #  else
+#    ifdef __KERNEL_SSE__
+  float3 a(_mm_set1_ps(f));
+#    else
  float3 a = {f, f, f, f};
+#    endif
 #  endif
  return a;
 }

 ccl_device_inline float3 make_float3(float x, float y, float z)
 {
-#  ifdef __KERNEL_SSE__
-  float3 a(_mm_set_ps(0.0f, z, y, x));
+#  ifdef __KERNEL_GPU__
+  float3 a = {x, y, z};
 #  else
+#    ifdef __KERNEL_SSE__
+  float3 a(_mm_set_ps(0.0f, z, y, x));
+#    else
  float3 a = {x, y, z, 0.0f};
+#    endif
 #  endif
  return a;
 }
@@ -83,8 +88,6 @@ ccl_device_inline void print_float3(const char *label, const float3 &a)
 {
  printf("%s: %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z);
 }
-#endif /* !defined(__KERNEL_GPU__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT3_IMPL_H__ */
--- a/intern/cycles/util/types_float4.h
+++ b/intern/cycles/util/types_float4.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_FLOAT4_H__
-#define __UTIL_TYPES_FLOAT4_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,7 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct int4;

 struct ccl_try_align(16) float4
@@ -35,16 +34,16 @@ struct ccl_try_align(16) float4
  float x, y, z, w;
 #  endif /* __KERNEL_SSE__ */

+#  ifndef __KERNEL_GPU__
  __forceinline float operator[](int i) const;
  __forceinline float &operator[](int i);
+#  endif
 };

 ccl_device_inline float4 make_float4(float f);
 ccl_device_inline float4 make_float4(float x, float y, float z, float w);
 ccl_device_inline float4 make_float4(const int4 &i);
 ccl_device_inline void print_float4(const char *label, const float4 &a);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT4_H__ */
--- a/intern/cycles/util/types_float4_impl.h
+++ b/intern/cycles/util/types_float4_impl.h
@@ -1,20 +1,15 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_FLOAT4_IMPL_H__
-#define __UTIL_TYPES_FLOAT4_IMPL_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif

-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 #  ifdef __KERNEL_SSE__
 __forceinline float4::float4()
 {
@@ -41,6 +36,7 @@ __forceinline float4 &float4::operator=(const float4 &a)
 }
 #  endif /* __KERNEL_SSE__ */

+#  ifndef __KERNEL_GPU__
 __forceinline float float4::operator[](int i) const
 {
  util_assert(i >= 0);
@@ -54,6 +50,7 @@ __forceinline float &float4::operator[](int i)
  util_assert(i < 4);
  return *(&x + i);
 }
+#  endif

 ccl_device_inline float4 make_float4(float f)
 {
@@ -89,8 +86,6 @@ ccl_device_inline void print_float4(const char *label, const float4 &a)
 {
  printf("%s: %.8f %.8f %.8f %.8f\n", label, (double)a.x, (double)a.y, (double)a.z, (double)a.w);
 }
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT4_IMPL_H__ */
--- a/intern/cycles/util/types_float8.h
+++ b/intern/cycles/util/types_float8.h
@@ -2,8 +2,7 @@
 * Original code Copyright 2017, Intel Corporation
 * Modifications Copyright 2018-2022 Blender Foundation. */

-#ifndef __UTIL_TYPES_FLOAT8_H__
-#define __UTIL_TYPES_FLOAT8_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -11,11 +10,16 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+/* float8 is a reserved type in Metal that has not been implemented. For
+ * that reason this is named float8_t and does not use native vector types. */

-struct ccl_try_align(32) float8
+#ifdef __KERNEL_GPU__
+struct float8_t
+#else
+struct ccl_try_align(32) float8_t
+#endif
 {
-#  ifdef __KERNEL_AVX2__
+#ifdef __KERNEL_AVX2__
  union {
    __m256 m256;
    struct {
@@ -23,28 +27,27 @@ struct ccl_try_align(32) float8
    };
  };

-  __forceinline float8();
-  __forceinline float8(const float8 &a);
-  __forceinline explicit float8(const __m256 &a);
+  __forceinline float8_t();
+  __forceinline float8_t(const float8_t &a);
+  __forceinline explicit float8_t(const __m256 &a);

  __forceinline operator const __m256 &() const;
  __forceinline operator __m256 &();

-  __forceinline float8 &operator=(const float8 &a);
+  __forceinline float8_t &operator=(const float8_t &a);

-#  else  /* __KERNEL_AVX2__ */
+#else  /* __KERNEL_AVX2__ */
  float a, b, c, d, e, f, g, h;
-#  endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */

+#ifndef __KERNEL_GPU__
  __forceinline float operator[](int i) const;
  __forceinline float &operator[](int i);
+#endif
 };

-ccl_device_inline float8 make_float8(float f);
-ccl_device_inline float8
-make_float8(float a, float b, float c, float d, float e, float f, float g, float h);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+ccl_device_inline float8_t make_float8_t(float f);
+ccl_device_inline float8_t
+make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h);

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT8_H__ */
--- a/intern/cycles/util/types_float8_impl.h
+++ b/intern/cycles/util/types_float8_impl.h
@@ -2,87 +2,79 @@
 * Original code Copyright 2017, Intel Corporation
 * Modifications Copyright 2018-2022 Blender Foundation. */

-#ifndef __UTIL_TYPES_FLOAT8_IMPL_H__
-#define __UTIL_TYPES_FLOAT8_IMPL_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
 #endif

-#ifndef __KERNEL_GPU__
-#  include <cstdio>
-#endif
-
 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
-#  ifdef __KERNEL_AVX2__
-__forceinline float8::float8()
+#ifdef __KERNEL_AVX2__
+__forceinline float8_t::float8_t()
 {
 }

-__forceinline float8::float8(const float8 &f) : m256(f.m256)
+__forceinline float8_t::float8_t(const float8_t &f) : m256(f.m256)
 {
 }

-__forceinline float8::float8(const __m256 &f) : m256(f)
+__forceinline float8_t::float8_t(const __m256 &f) : m256(f)
 {
 }

-__forceinline float8::operator const __m256 &() const
+__forceinline float8_t::operator const __m256 &() const
 {
  return m256;
 }

-__forceinline float8::operator __m256 &()
+__forceinline float8_t::operator __m256 &()
 {
  return m256;
 }

-__forceinline float8 &float8::operator=(const float8 &f)
+__forceinline float8_t &float8_t::operator=(const float8_t &f)
 {
  m256 = f.m256;
  return *this;
 }
-#  endif /* __KERNEL_AVX2__ */
+#endif /* __KERNEL_AVX2__ */

-__forceinline float float8::operator[](int i) const
+#ifndef __KERNEL_GPU__
+__forceinline float float8_t::operator[](int i) const
 {
  util_assert(i >= 0);
  util_assert(i < 8);
  return *(&a + i);
 }

-__forceinline float &float8::operator[](int i)
+__forceinline float &float8_t::operator[](int i)
 {
  util_assert(i >= 0);
  util_assert(i < 8);
  return *(&a + i);
 }
+#endif

-ccl_device_inline float8 make_float8(float f)
+ccl_device_inline float8_t make_float8_t(float f)
 {
-#  ifdef __KERNEL_AVX2__
-  float8 r(_mm256_set1_ps(f));
-#  else
-  float8 r = {f, f, f, f, f, f, f, f};
-#  endif
+#ifdef __KERNEL_AVX2__
+  float8_t r(_mm256_set1_ps(f));
+#else
+  float8_t r = {f, f, f, f, f, f, f, f};
+#endif
  return r;
 }

-ccl_device_inline float8
-make_float8(float a, float b, float c, float d, float e, float f, float g, float h)
+ccl_device_inline float8_t
+make_float8_t(float a, float b, float c, float d, float e, float f, float g, float h)
 {
-#  ifdef __KERNEL_AVX2__
-  float8 r(_mm256_set_ps(a, b, c, d, e, f, g, h));
-#  else
-  float8 r = {a, b, c, d, e, f, g, h};
-#  endif
+#ifdef __KERNEL_AVX2__
+  float8_t r(_mm256_setr_ps(a, b, c, d, e, f, g, h));
+#else
+  float8_t r = {a, b, c, d, e, f, g, h};
+#endif
  return r;
 }

-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
-
 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_FLOAT8_IMPL_H__ */
--- a/intern/cycles/util/types_int2.h
+++ b/intern/cycles/util/types_int2.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_INT2_H__
-#define __UTIL_TYPES_INT2_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,17 +9,17 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct int2 {
  int x, y;

+#  ifndef __KERNEL_GPU__
  __forceinline int operator[](int i) const;
  __forceinline int &operator[](int i);
+#  endif
 };

 ccl_device_inline int2 make_int2(int x, int y);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT2_H__ */
--- a/intern/cycles/util/types_int2_impl.h
+++ b/intern/cycles/util/types_int2_impl.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_INT2_IMPL_H__
-#define __UTIL_TYPES_INT2_IMPL_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,7 +9,8 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
+#  ifndef __KERNEL_GPU__
 int int2::operator[](int i) const
 {
  util_assert(i >= 0);
@@ -24,14 +24,13 @@ int &int2::operator[](int i)
  util_assert(i < 2);
  return *(&x + i);
 }
+#  endif

 ccl_device_inline int2 make_int2(int x, int y)
 {
  int2 a = {x, y};
  return a;
 }
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT2_IMPL_H__ */
--- a/intern/cycles/util/types_int3.h
+++ b/intern/cycles/util/types_int3.h
@@ -1,8 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#ifndef __UTIL_TYPES_INT3_H__
-#define __UTIL_TYPES_INT3_H__
+#pragma once

 #ifndef __UTIL_TYPES_H__
 #  error "Do not include this file directly, include util/types.h instead."
@@ -10,10 +9,15 @@

 CCL_NAMESPACE_BEGIN

-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__)
+#ifndef __KERNEL_NATIVE_VECTOR_TYPES__
 struct ccl_try_align(16) int3
 {
-#  ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_GPU__
+  /* Compact structure on the GPU. */
+  int x, y, z;
+#  else
+  /* SIMD aligned structure for CPU. */
+#    ifdef __KERNEL_SSE__
  union {
    __m128i m128;
    struct {
@@ -29,19 +33,20 @@ struct ccl_try_align(16) int3
  __forceinline operator __m128i &();

  __forceinline int3 &operator=(const int3 &a);
-#  else  /* __KERNEL_SSE__ */
+#    else  /* __KERNEL_SSE__ */
  int x, y, z, w;
-#  endif /* __KERNEL_SSE__ */
+#    endif /* __KERNEL_SSE__ */
+#  endif

+#  ifndef __KERNEL_GPU__
  __forceinline int operator[](int i) const;
  __forceinline int &operator[](int i);
+#  endif
 };

 ccl_device_inline int3 make_int3(int i);
 ccl_device_inline int3 make_int3(int x, int y, int z);
 ccl_device_inline void print_int3(const char *label, const int3 &a);
-#endif /* !defined(__KERNEL_GPU__) || defined(__KERNEL_ONEAPI__) */
+#endif /* __KERNEL_NATIVE_VECTOR_TYPES__ */

 CCL_NAMESPACE_END
-
-#endif /* __UTIL_TYPES_INT3_H__ */
--- a/Show More
+++ b/Show More