Merge branch 'main' into geometry-nodes-simulation

Geometry Nodes: add map from bsocket index to lf socket index
The socket indices in `bNode` and their corresponding `lf::Node` don't match exactly, because `lf::Node` does not contain the unavailable sockets. A simple mapping from `bNodeSocket` index to `lf::Socket` index is required for future work. For now it only removes the need for various tempory vectors.
2023-04-22 16:15:41 +02:00 · 2023-04-22 16:14:45 +02:00 · 2023-04-22 15:28:26 +02:00 · 2023-04-22 15:01:56 +02:00 · 2023-04-22 14:48:43 +02:00 · 2023-04-22 13:46:11 +02:00
1123 changed files with 29376 additions and 17810 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -521,7 +521,8 @@ endif()
 if(NOT APPLE)
  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" ON)
  option(WITH_CYCLES_HIP_BINARIES      "Build Cycles AMD HIP binaries" OFF)
-  set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx906 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
+  # Radeon VII (gfx906) not currently working with HIP SDK, so left out of the list.
+  set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
  mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
  mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
 endif()
@@ -1580,6 +1581,8 @@ elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
  add_check_c_compiler_flag(C_REMOVE_STRICT_FLAGS C_WARN_NO_MISSING_NORETURN -Wno-missing-noreturn)
  add_check_c_compiler_flag(C_REMOVE_STRICT_FLAGS C_WARN_NO_UNUSED_BUT_SET_VARIABLE -Wno-unused-but-set-variable)
  add_check_c_compiler_flag(C_REMOVE_STRICT_FLAGS C_WARN_NO_DEPRECATED_DECLARATIONS -Wno-deprecated-declarations)
+  add_check_c_compiler_flag(C_REMOVE_STRICT_FLAGS C_WARN_NO_STRICT_PROTOTYPES -Wno-strict-prototypes)
+  add_check_c_compiler_flag(C_REMOVE_STRICT_FLAGS C_WARN_NO_BITWISE_INSTEAD_OF_LOGICAL -Wno-bitwise-instead-of-logical)

  add_check_cxx_compiler_flag(CXX_REMOVE_STRICT_FLAGS CXX_WARN_NO_UNUSED_PARAMETER -Wno-unused-parameter)
  add_check_cxx_compiler_flag(CXX_REMOVE_STRICT_FLAGS CXX_WARN_NO_UNUSED_PRIVATE_FIELD -Wno-unused-private-field)
@@ -1593,6 +1596,7 @@ elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
  add_check_cxx_compiler_flag(CXX_REMOVE_STRICT_FLAGS CXX_WARN_NO_UNDEFINED_VAR_TEMPLATE -Wno-undefined-var-template)
  add_check_cxx_compiler_flag(CXX_REMOVE_STRICT_FLAGS CXX_WARN_NO_INSTANTIATION_AFTER_SPECIALIZATION -Wno-instantiation-after-specialization)
  add_check_cxx_compiler_flag(CXX_REMOVE_STRICT_FLAGS CXX_WARN_NO_MISLEADING_INDENTATION    -Wno-misleading-indentation)
+  add_check_cxx_compiler_flag(CXX_REMOVE_STRICT_FLAGS CXX_WARN_NO_BITWISE_INSTEAD_OF_LOGICAL -Wno-bitwise-instead-of-logical)

 elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")

--- a/18
+++ b/18
@@ -58,9 +58,6 @@ Static Source Code Checking
   * check_cppcheck:        Run blender source through cppcheck (C & C++).
   * check_clang_array:     Run blender source through clang array checking script (C & C++).
   * check_deprecated:      Check if there is any deprecated code to remove.
-   * check_splint:          Run blenders source through splint (C only).
-   * check_sparse:          Run blenders source through sparse (C only).
-   * check_smatch:          Run blenders source through smatch (C only).
   * check_descriptions:    Check for duplicate/invalid descriptions.
   * check_licenses:        Check license headers follow the SPDX license specification,
                            using one of the accepted licenses in 'doc/license/SPDX-license-identifiers.txt'
@@ -474,21 +471,6 @@ check_clang_array: .FORCE
 	@cd "$(BUILD_DIR)" ; \
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_clang_array.py"

-check_splint: .FORCE
-	@$(CMAKE_CONFIG)
-	@cd "$(BUILD_DIR)" ; \
-	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_splint.py"
-
-check_sparse: .FORCE
-	@$(CMAKE_CONFIG)
-	@cd "$(BUILD_DIR)" ; \
-	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_sparse.py"
-
-check_smatch: .FORCE
-	@$(CMAKE_CONFIG)
-	@cd "$(BUILD_DIR)" ; \
-	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_smatch.py"
-
 check_mypy: .FORCE
 	@$(PYTHON) "$(BLENDER_DIR)/tools/check_source/check_mypy.py"

--- a/build_files/build_environment/CMakeLists.txt
+++ b/build_files/build_environment/CMakeLists.txt
@@ -90,28 +90,26 @@ include(cmake/haru.cmake)
 # Boost needs to be included after `python.cmake` due to the PYTHON_BINARY variable being needed.
 include(cmake/boost.cmake)
 include(cmake/pugixml.cmake)
-include(cmake/ispc.cmake)
-include(cmake/openimagedenoise.cmake)
-include(cmake/embree.cmake)
-include(cmake/openpgl.cmake)
-include(cmake/fmt.cmake)
-include(cmake/robinmap.cmake)
-include(cmake/xml2.cmake)
-
 include(cmake/fribidi.cmake)
 include(cmake/harfbuzz.cmake)
 if(NOT APPLE)
  include(cmake/xr_openxr.cmake)
-  if(NOT WIN32 OR BUILD_MODE STREQUAL Release)
-    include(cmake/dpcpp.cmake)
-    include(cmake/dpcpp_deps.cmake)
-  endif()
+  include(cmake/dpcpp.cmake)
+  include(cmake/dpcpp_deps.cmake)
  if(NOT WIN32)
    include(cmake/igc.cmake)
    include(cmake/gmmlib.cmake)
    include(cmake/ocloc.cmake)
  endif()
 endif()
+include(cmake/ispc.cmake)
+include(cmake/openimagedenoise.cmake)
+# Embree needs to be included after dpcpp as it uses it for compiling with GPU support
+include(cmake/embree.cmake)
+include(cmake/openpgl.cmake)
+include(cmake/fmt.cmake)
+include(cmake/robinmap.cmake)
+include(cmake/xml2.cmake)

 # OpenColorIO and dependencies.
 include(cmake/expat.cmake)
--- a/build_files/build_environment/cmake/download.cmake
+++ b/build_files/build_environment/cmake/download.cmake
@@ -156,6 +156,7 @@ download_source(OPENCLHEADERS)
 download_source(ICDLOADER)
 download_source(MP11)
 download_source(SPIRV_HEADERS)
+download_source(UNIFIED_RUNTIME)
 download_source(IGC)
 download_source(IGC_LLVM)
 download_source(IGC_OPENCL_CLANG)
--- a/build_files/build_environment/cmake/dpcpp.cmake
+++ b/build_files/build_environment/cmake/dpcpp.cmake
@@ -5,6 +5,9 @@
 # for now.
 string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " DPCPP_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")

+# DPCPP already generates debug libs, there isn't much point in compiling it in debug mode itself.
+string(REPLACE "-DCMAKE_BUILD_TYPE=Debug" "-DCMAKE_BUILD_TYPE=Release" DPCPP_CMAKE_FLAGS "${DPCPP_CMAKE_FLAGS}")
+
 if(WIN32)
  set(LLVM_GENERATOR "Ninja")
 else()
@@ -38,17 +41,18 @@ set(DPCPP_EXTRA_ARGS
  -DLEVEL_ZERO_LIBRARY=${LIBDIR}/level-zero/lib/${LIBPREFIX}ze_loader${SHAREDLIBEXT}
  -DLEVEL_ZERO_INCLUDE_DIR=${LIBDIR}/level-zero/include
  -DLLVM_EXTERNAL_SPIRV_HEADERS_SOURCE_DIR=${BUILD_DIR}/spirvheaders/src/external_spirvheaders/
+  -DUNIFIED_RUNTIME_SOURCE_DIR=${BUILD_DIR}/unifiedruntime/src/external_unifiedruntime/
  # Below here is copied from an invocation of buildbot/config.py
  -DLLVM_ENABLE_ASSERTIONS=ON
  -DLLVM_TARGETS_TO_BUILD=X86
-  -DLLVM_EXTERNAL_PROJECTS=sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw
+  -DLLVM_EXTERNAL_PROJECTS=sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw^^lld
  -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/sycl
  -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/llvm-spirv
  -DLLVM_EXTERNAL_XPTI_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xpti
  -DXPTI_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xpti
  -DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xptifw
  -DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/libdevice
-  -DLLVM_ENABLE_PROJECTS=clang^^sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw
+  -DLLVM_ENABLE_PROJECTS=clang^^sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw^^lld
  -DLIBCLC_TARGETS_TO_BUILD=
  -DLIBCLC_GENERATE_REMANGLED_VARIANTS=OFF
  -DSYCL_BUILD_PI_HIP_PLATFORM=AMD
@@ -104,13 +108,19 @@ add_dependencies(
  external_mp11
  external_level-zero
  external_spirvheaders
+  external_unifiedruntime
 )

 if(BUILD_MODE STREQUAL Release AND WIN32)
  ExternalProject_Add_Step(external_dpcpp after_install
-      COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang-cl.exe
-      COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang-cpp.exe
-      COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang.exe
      COMMAND ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/dpcpp ${HARVEST_TARGET}/dpcpp
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/clang-cl.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/clang-cpp.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/clang.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/ld.lld.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/ld64.lld.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/lld.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/lld-link.exe
+      COMMAND ${CMAKE_COMMAND} -E rm -f ${HARVEST_TARGET}/dpcpp/bin/wasm-ld.exe
  )
 endif()
--- a/build_files/build_environment/cmake/dpcpp_deps.cmake
+++ b/build_files/build_environment/cmake/dpcpp_deps.cmake
@@ -59,3 +59,13 @@ ExternalProject_Add(external_spirvheaders
  BUILD_COMMAND echo .
  INSTALL_COMMAND echo .
 )
+
+ExternalProject_Add(external_unifiedruntime
+  URL file://${PACKAGE_DIR}/${UNIFIED_RUNTIME_FILE}
+  URL_HASH ${UNIFIED_RUNTIME_HASH_TYPE}=${UNIFIED_RUNTIME_HASH}
+  DOWNLOAD_DIR ${DOWNLOAD_DIR}
+  PREFIX ${BUILD_DIR}/unifiedruntime
+  CONFIGURE_COMMAND echo .
+  BUILD_COMMAND echo .
+  INSTALL_COMMAND echo .
+)
--- a/build_files/build_environment/cmake/embree.cmake
+++ b/build_files/build_environment/cmake/embree.cmake
@@ -3,6 +3,8 @@
 # Note the utility apps may use png/tiff/gif system libraries, but the
 # library itself does not depend on them, so should give no problems.

+set(EMBREE_CMAKE_FLAGS ${DEFAULT_CMAKE_FLAGS})
+
 set(EMBREE_EXTRA_ARGS
  -DEMBREE_ISPC_SUPPORT=OFF
  -DEMBREE_TUTORIALS=OFF
@@ -31,6 +33,43 @@ if(NOT BLENDER_PLATFORM_ARM)
  )
 endif()

+if(NOT APPLE)
+  if(WIN32)
+    # Levels below -O2 don't work well for Embree+SYCL.
+    string(REGEX REPLACE "-O[A-Za-z0-9]" "" EMBREE_CLANG_CMAKE_CXX_FLAGS_DEBUG ${BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG})
+    string(APPEND EMBREE_CLANG_CMAKE_CXX_FLAGS_DEBUG " -O2")
+    string(REGEX REPLACE "-O[A-Za-z0-9]" "" EMBREE_CLANG_CMAKE_C_FLAGS_DEBUG ${BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG})
+    string(APPEND EMBREE_CLANG_CMAKE_C_FLAGS_DEBUG " -O2")
+    set(EMBREE_CMAKE_FLAGS
+      -DCMAKE_BUILD_TYPE=${BUILD_MODE}
+      -DCMAKE_CXX_FLAGS_RELEASE=${BLENDER_CLANG_CMAKE_CXX_FLAGS_RELEASE}
+      -DCMAKE_CXX_FLAGS_MINSIZEREL=${BLENDER_CLANG_CMAKE_CXX_FLAGS_MINSIZEREL}
+      -DCMAKE_CXX_FLAGS_RELWITHDEBINFO=${BLENDER_CLANG_CMAKE_CXX_FLAGS_RELWITHDEBINFO}
+      -DCMAKE_CXX_FLAGS_DEBUG=${EMBREE_CLANG_CMAKE_CXX_FLAGS_DEBUG}
+      -DCMAKE_C_FLAGS_RELEASE=${BLENDER_CLANG_CMAKE_C_FLAGS_RELEASE}
+      -DCMAKE_C_FLAGS_MINSIZEREL=${BLENDER_CLANG_CMAKE_C_FLAGS_MINSIZEREL}
+      -DCMAKE_C_FLAGS_RELWITHDEBINFO=${BLENDER_CLANG_CMAKE_C_FLAGS_RELWITHDEBINFO}
+      -DCMAKE_C_FLAGS_DEBUG=${EMBREE_CLANG_CMAKE_C_FLAGS_DEBUG}
+      -DCMAKE_CXX_STANDARD=17
+    )
+    set(EMBREE_EXTRA_ARGS
+      -DCMAKE_CXX_COMPILER=${LIBDIR}/dpcpp/bin/clang++.exe
+      -DCMAKE_C_COMPILER=${LIBDIR}/dpcpp/bin/clang.exe
+      -DCMAKE_SHARED_LINKER_FLAGS=-L"${LIBDIR}/dpcpp/lib"
+      -DEMBREE_SYCL_SUPPORT=ON
+      ${EMBREE_EXTRA_ARGS}
+    )
+  else()
+    set(EMBREE_EXTRA_ARGS
+      -DCMAKE_CXX_COMPILER=${LIBDIR}/dpcpp/bin/clang++
+      -DCMAKE_C_COMPILER=${LIBDIR}/dpcpp/bin/clang
+      -DCMAKE_SHARED_LINKER_FLAGS=-L"${LIBDIR}/dpcpp/lib"
+      -DEMBREE_SYCL_SUPPORT=ON
+      ${EMBREE_EXTRA_ARGS}
+    )
+  endif()
+endif()
+
 if(TBB_STATIC_LIBRARY)
  set(EMBREE_EXTRA_ARGS
    ${EMBREE_EXTRA_ARGS}
@@ -42,16 +81,25 @@ ExternalProject_Add(external_embree
  URL file://${PACKAGE_DIR}/${EMBREE_FILE}
  DOWNLOAD_DIR ${DOWNLOAD_DIR}
  URL_HASH ${EMBREE_HASH_TYPE}=${EMBREE_HASH}
+  CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR}
  PREFIX ${BUILD_DIR}/embree
  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/embree/src/external_embree < ${PATCH_DIR}/embree.diff
-  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/embree ${DEFAULT_CMAKE_FLAGS} ${EMBREE_EXTRA_ARGS}
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/embree ${EMBREE_CMAKE_FLAGS} ${EMBREE_EXTRA_ARGS}
  INSTALL_DIR ${LIBDIR}/embree
 )

-add_dependencies(
-  external_embree
-  external_tbb
-)
+if(NOT APPLE)
+  add_dependencies(
+    external_embree
+    external_tbb
+    external_dpcpp
+  )
+else()
+  add_dependencies(
+    external_embree
+    external_tbb
+  )
+endif()

 if(WIN32)
  if(BUILD_MODE STREQUAL Release)
@@ -66,6 +114,7 @@ if(WIN32)
    ExternalProject_Add_Step(external_embree after_install
      COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/embree/bin/embree4_d.dll ${HARVEST_TARGET}/embree/bin/embree4_d.dll
      COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/embree/lib/embree4_d.lib ${HARVEST_TARGET}/embree/lib/embree4_d.lib
+      COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/embree/lib/embree4_sycl_d.lib ${HARVEST_TARGET}/embree/lib/embree4_sycl_d.lib
      DEPENDEES install
    )
  endif()
--- a/build_files/build_environment/cmake/options.cmake
+++ b/build_files/build_environment/cmake/options.cmake
@@ -74,6 +74,27 @@ if(WIN32)
  set(BLENDER_CMAKE_CXX_FLAGS_RELEASE "/MD ${COMMON_MSVC_FLAGS} /D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /O2 /Ob2 /D NDEBUG /D PLATFORM_WINDOWS /DPSAPI_VERSION=2 /DTINYFORMAT_ALLOW_WCHAR_STRINGS")
  set(BLENDER_CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD ${COMMON_MSVC_FLAGS} /D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /Zi /O2 /Ob1 /D NDEBUG /D PLATFORM_WINDOWS /DPSAPI_VERSION=2 /DTINYFORMAT_ALLOW_WCHAR_STRINGS")

+  # Set similar flags for CLANG compilation.
+  set(COMMON_CLANG_FLAGS "-D_DLL -D_MT") # Equivalent to MSVC /MD
+
+  if(WITH_OPTIMIZED_DEBUG)
+    set(BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrtd -O2 -D_DEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+  else()
+    set(BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrtd -g -D_DEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+  endif()
+  set(BLENDER_CLANG_CMAKE_C_FLAGS_MINSIZEREL "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -Os -DNDEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+  set(BLENDER_CLANG_CMAKE_C_FLAGS_RELEASE "${COMMON_CLANG_FLAGS}  -Xclang --dependent-lib=msvcrt -O2 -DNDEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+  set(BLENDER_CLANG_CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -g -O2 -DNDEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+
+  if(WITH_OPTIMIZED_DEBUG)
+    set(BLENDER_CLANG_CMAKE_CXX_FLAGS_DEBUG "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrtd -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -O2 -D_DEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS -DBOOST_DEBUG_PYTHON -DBOOST_ALL_NO_LIB")
+  else()
+    set(BLENDER_CLANG_CMAKE_CXX_FLAGS_DEBUG "${COMMON_CLANG_FLAG} -Xclang --dependent-lib=msvcrtd -D_DEBUG -DPLATFORM_WINDOWS -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -g -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS -DBOOST_DEBUG_PYTHON -DBOOST_ALL_NO_LIB")
+  endif()
+  set(BLENDER_CLANG_CMAKE_CXX_FLAGS_MINSIZEREL "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -O2 -DNDEBUG  -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+  set(BLENDER_CLANG_CMAKE_CXX_FLAGS_RELEASE "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -O2 -DNDEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+  set(BLENDER_CLANG_CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -g -O2 -DNDEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
+
  set(PLATFORM_FLAGS)
  set(PLATFORM_CXX_FLAGS)
  set(PLATFORM_CMAKE_FLAGS)
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -599,15 +599,15 @@ set(OPENPGL_HASH db63f5dac5cfa8c110ede241f0c413f00db0c4748697381c4fa23e0f9e82a75
 set(OPENPGL_HASH_TYPE SHA256)
 set(OPENPGL_FILE openpgl-${OPENPGL_VERSION}.tar.gz)

-set(LEVEL_ZERO_VERSION v1.8.5)
+set(LEVEL_ZERO_VERSION v1.8.8)
 set(LEVEL_ZERO_URI https://github.com/oneapi-src/level-zero/archive/refs/tags/${LEVEL_ZERO_VERSION}.tar.gz)
-set(LEVEL_ZERO_HASH b6e9663bbcc53c148d32376998298bec6f7c434ef2218c61fa708963e3a09394)
+set(LEVEL_ZERO_HASH 3553ae8fa0d2d69c4210a8f3428bd6612bd8bb8a627faf52c3658a01851e66d2)
 set(LEVEL_ZERO_HASH_TYPE SHA256)
 set(LEVEL_ZERO_FILE level-zero-${LEVEL_ZERO_VERSION}.tar.gz)

-set(DPCPP_VERSION 20221019)
-set(DPCPP_URI https://github.com/intel/llvm/archive/refs/tags/sycl-nightly/${DPCPP_VERSION}.tar.gz)
-set(DPCPP_HASH 2f533946e91ce3829431758ea17b0b834b960c1a796e9e4563c86e03eb9603a2)
+set(DPCPP_VERSION 2022-12)
+set(DPCPP_URI https://github.com/intel/llvm/archive/refs/tags/${DPCPP_VERSION}.tar.gz)
+set(DPCPP_HASH 13151d5ae79f7c9c4a9b072a0c486ae7b3c4993e301bb1268c92214451025790)
 set(DPCPP_HASH_TYPE SHA256)
 set(DPCPP_FILE DPCPP-${DPCPP_VERSION}.tar.gz)

@@ -620,9 +620,9 @@ set(DPCPP_FILE DPCPP-${DPCPP_VERSION}.tar.gz)
 # will take care of building them, unpack is being done in dpcpp_deps.cmake

 # Source llvm/lib/SYCLLowerIR/CMakeLists.txt
-set(VCINTRINSICS_VERSION abce9184b7a3a7fe1b02289b9285610d9dc45465)
+set(VCINTRINSICS_VERSION 782fbf7301dc73acaa049a4324c976ad94f587f7)
 set(VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/${VCINTRINSICS_VERSION}.tar.gz)
-set(VCINTRINSICS_HASH 3e9fd471246b87633b26f7e15e17ab7733d357458c53d5c5881c03929d6c551f)
+set(VCINTRINSICS_HASH f4c0ccad8c1f77760364c551c65e8e1cf194d058889fa46d3b1b2d19ec4dc33f)
 set(VCINTRINSICS_HASH_TYPE SHA256)
 set(VCINTRINSICS_FILE vc-intrinsics-${VCINTRINSICS_VERSION}.tar.gz)

@@ -657,6 +657,13 @@ set(SPIRV_HEADERS_HASH ec8ecb471a62672697846c436501638ab25447ae9d4a6761e0bfe8a9a
 set(SPIRV_HEADERS_HASH_TYPE SHA256)
 set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)

+# Source llvm/sycl/plugins/unified_runtime/CMakeLists.txt
+set(UNIFIED_RUNTIME_VERSION fd711c920acc4434cb52ff18b078c082d9d7f44d)
+set(UNIFIED_RUNTIME_URI https://github.com/oneapi-src/unified-runtime/archive/${UNIFIED_RUNTIME_VERSION}.tar.gz)
+set(UNIFIED_RUNTIME_HASH 535ca2ee78f68c5e7e62b10f1bbabd909179488885566e6d9b1fc50e8a1be65f)
+set(UNIFIED_RUNTIME_HASH_TYPE SHA256)
+set(UNIFIED_RUNTIME_FILE unified-runtime-${UNIFIED_RUNTIME_VERSION}.tar.gz)
+
 ######################
 ### DPCPP DEPS END ###
 ######################
@@ -730,9 +737,9 @@ set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9
 set(GMMLIB_HASH_TYPE SHA256)
 set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)

-set(OCLOC_VERSION 22.49.25018.21)
+set(OCLOC_VERSION 23.05.25593.18)
 set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
-set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
+set(OCLOC_HASH 122415028e631922ae999c996954dfd98ce9a32decd564d5484c31476ec9306e)
 set(OCLOC_HASH_TYPE SHA256)
 set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

--- a/build_files/build_environment/dependencies.dot
+++ b/build_files/build_environment/dependencies.dot
@@ -14,6 +14,7 @@ graph[autosize = false, size = "25.7,8.3!", resolution = 300];
 	external_dpcpp -- external_mp11;
 	external_dpcpp -- external_level_zero;
 	external_dpcpp -- external_spirvheaders;
+	external_dpcpp -- external_unifiedruntime;
 	external_embree -- external_tbb;
 	external_ffmpeg -- external_zlib;
 	external_ffmpeg -- external_openjpeg;
--- a/build_files/build_environment/patches/dpcpp.diff
+++ b/build_files/build_environment/patches/dpcpp.diff
@@ -34,3 +34,156 @@ diff -Naur llvm-sycl-nightly-20220208.orig/libdevice/cmake/modules/SYCLLibdevice
   libsycldevice-obj
   libsycldevice-spv)
 
+diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
+index 17eeaafae194..09e6d2217aaa 100644
+--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
+@@ -1647,46 +1647,120 @@ ProgramManager::getSYCLDeviceImagesWithCompatibleState(
+   }
+   assert(BinImages.size() > 0 && "Expected to find at least one device image");
+ 
+  // Ignore images with incompatible state. Image is considered compatible
+  // with a target state if an image is already in the target state or can
+  // be brought to target state by compiling/linking/building.
+  //
+  // Example: an image in "executable" state is not compatible with
+  // "input" target state - there is no operation to convert the image it
+  // to "input" state. An image in "input" state is compatible with
+  // "executable" target state because it can be built to get into
+  // "executable" state.
+  for (auto It = BinImages.begin(); It != BinImages.end();) {
+    if (getBinImageState(*It) > TargetState)
+      It = BinImages.erase(It);
+    else
+      ++It;
+  }
+
+   std::vector<device_image_plain> SYCLDeviceImages;
+-  for (RTDeviceBinaryImage *BinImage : BinImages) {
+-    const bundle_state ImgState = getBinImageState(BinImage);
+-
+-    // Ignore images with incompatible state. Image is considered compatible
+-    // with a target state if an image is already in the target state or can
+-    // be brought to target state by compiling/linking/building.
+-    //
+-    // Example: an image in "executable" state is not compatible with
+-    // "input" target state - there is no operation to convert the image it
+-    // to "input" state. An image in "input" state is compatible with
+-    // "executable" target state because it can be built to get into
+-    // "executable" state.
+-    if (ImgState > TargetState)
+-      continue;
+ 
+-    for (const sycl::device &Dev : Devs) {
+  // If a non-input state is requested, we can filter out some compatible
+  // images and return only those with the highest compatible state for each
+  // device-kernel pair. This map tracks how many kernel-device pairs need each
+  // image, so that any unneeded ones are skipped.
+  // TODO this has no effect if the requested state is input, consider having
+  // a separate branch for that case to avoid unnecessary tracking work.
+  struct DeviceBinaryImageInfo {
+    std::shared_ptr<std::vector<sycl::kernel_id>> KernelIDs;
+    bundle_state State = bundle_state::input;
+    int RequirementCounter = 0;
+  };
+  std::unordered_map<RTDeviceBinaryImage *, DeviceBinaryImageInfo> ImageInfoMap;
+
+  for (const sycl::device &Dev : Devs) {
+    // Track the highest image state for each requested kernel.
+    using StateImagesPairT =
+        std::pair<bundle_state, std::vector<RTDeviceBinaryImage *>>;
+    using KernelImageMapT =
+        std::map<kernel_id, StateImagesPairT, LessByNameComp>;
+    KernelImageMapT KernelImageMap;
+    if (!KernelIDs.empty())
+      for (const kernel_id &KernelID : KernelIDs)
+        KernelImageMap.insert({KernelID, {}});
+
+    for (RTDeviceBinaryImage *BinImage : BinImages) {
+       if (!compatibleWithDevice(BinImage, Dev) ||
+           !doesDevSupportImgAspects(Dev, *BinImage))
+         continue;
+ 
+-      std::shared_ptr<std::vector<sycl::kernel_id>> KernelIDs;
+-      // Collect kernel names for the image
+-      {
+-        std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
+-        KernelIDs = m_BinImg2KernelIDs[BinImage];
+-        // If the image does not contain any non-service kernels we can skip it.
+-        if (!KernelIDs || KernelIDs->empty())
+-          continue;
+      auto InsertRes = ImageInfoMap.insert({BinImage, {}});
+      DeviceBinaryImageInfo &ImgInfo = InsertRes.first->second;
+      if (InsertRes.second) {
+        ImgInfo.State = getBinImageState(BinImage);
+        // Collect kernel names for the image
+        {
+          std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
+          ImgInfo.KernelIDs = m_BinImg2KernelIDs[BinImage];
+        }
+       }
+      const bundle_state ImgState = ImgInfo.State;
+      const std::shared_ptr<std::vector<sycl::kernel_id>> &ImageKernelIDs =
+          ImgInfo.KernelIDs;
+      int &ImgRequirementCounter = ImgInfo.RequirementCounter;
+ 
+-      DeviceImageImplPtr Impl = std::make_shared<detail::device_image_impl>(
+-          BinImage, Ctx, Devs, ImgState, KernelIDs, /*PIProgram=*/nullptr);
+      // If the image does not contain any non-service kernels we can skip it.
+      if (!ImageKernelIDs || ImageKernelIDs->empty())
+        continue;
+ 
+-      SYCLDeviceImages.push_back(
+-          createSyclObjFromImpl<device_image_plain>(Impl));
+-      break;
+      // Update tracked information.
+      for (kernel_id &KernelID : *ImageKernelIDs) {
+        StateImagesPairT *StateImagesPair;
+        // If only specific kernels are requested, ignore the rest.
+        if (!KernelIDs.empty()) {
+          auto It = KernelImageMap.find(KernelID);
+          if (It == KernelImageMap.end())
+            continue;
+          StateImagesPair = &It->second;
+        } else
+          StateImagesPair = &KernelImageMap[KernelID];
+
+        auto &[KernelImagesState, KernelImages] = *StateImagesPair;
+
+        if (KernelImages.empty()) {
+          KernelImagesState = ImgState;
+          KernelImages.push_back(BinImage);
+          ++ImgRequirementCounter;
+        } else if (KernelImagesState < ImgState) {
+          for (RTDeviceBinaryImage *Img : KernelImages) {
+            auto It = ImageInfoMap.find(Img);
+            assert(It != ImageInfoMap.end());
+            assert(It->second.RequirementCounter > 0);
+            --(It->second.RequirementCounter);
+          }
+          KernelImages.clear();
+          KernelImages.push_back(BinImage);
+          KernelImagesState = ImgState;
+          ++ImgRequirementCounter;
+        } else if (KernelImagesState == ImgState) {
+          KernelImages.push_back(BinImage);
+          ++ImgRequirementCounter;
+        }
+      }
+     }
+   }
+ 
+  for (const auto &ImgInfoPair : ImageInfoMap) {
+    if (ImgInfoPair.second.RequirementCounter == 0)
+      continue;
+
+    DeviceImageImplPtr Impl = std::make_shared<detail::device_image_impl>(
+        ImgInfoPair.first, Ctx, Devs, ImgInfoPair.second.State,
+        ImgInfoPair.second.KernelIDs, /*PIProgram=*/nullptr);
+
+    SYCLDeviceImages.push_back(createSyclObjFromImpl<device_image_plain>(Impl));
+  }
+
+   return SYCLDeviceImages;
+ }
+ 
--- a/build_files/build_environment/patches/embree.diff
+++ b/build_files/build_environment/patches/embree.diff
@@ -149,3 +149,19 @@ index 074f910a2..30f490818 100644
         return is_hit_first | is_hit_second;
       }
     };
+diff -ruN a/kernels/sycl/rthwif_embree_builder.cpp b/kernels/sycl/rthwif_embree_builder.cpp
+--- a/kernels/sycl/rthwif_embree_builder.cpp    2023-03-28 17:23:06.429190200 +0200
+++ b/kernels/sycl/rthwif_embree_builder.cpp    2023-03-28 17:35:01.291938600 +0200
+@@ -540,7 +540,12 @@
+       assert(offset <= geomDescrData.size());
+     }
+
+    /* Force running BVH building sequentially from the calling thread if using TBB < 2021, as it otherwise leads to runtime issues. */
+#if TBB_VERSION_MAJOR<2021
+    RTHWIF_PARALLEL_OPERATION parallelOperation = nullptr;
+#else
+     RTHWIF_PARALLEL_OPERATION parallelOperation = rthwifNewParallelOperation();
+#endif
+
+     /* estimate static accel size */
+     BBox1f time_range(0,1);
--- a/build_files/cmake/Modules/FindHIP.cmake
+++ b/build_files/cmake/Modules/FindHIP.cmake
@@ -37,18 +37,24 @@ elseif(HIP_HIPCC_EXECUTABLE)
  set(HIP_VERSION_MINOR 0)
  set(HIP_VERSION_PATCH 0)

+  if(WIN32)
+    set(_hipcc_executable ${HIP_HIPCC_EXECUTABLE}.bat)
+  else()
+    set(_hipcc_executable ${HIP_HIPCC_EXECUTABLE})
+  endif()
+
  # Get version from the output.
-  execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} --version
-                  OUTPUT_VARIABLE HIP_VERSION_RAW
+  execute_process(COMMAND ${_hipcc_executable} --version
+                  OUTPUT_VARIABLE _hip_version_raw
                  ERROR_QUIET
                  OUTPUT_STRIP_TRAILING_WHITESPACE)

  # Parse parts.
-  if(HIP_VERSION_RAW MATCHES "HIP version: .*")
+  if(_hip_version_raw MATCHES "HIP version: .*")
    # Strip the HIP prefix and get list of individual version components.
    string(REGEX REPLACE
           ".*HIP version: ([.0-9]+).*" "\\1"
-           HIP_SEMANTIC_VERSION "${HIP_VERSION_RAW}")
+           HIP_SEMANTIC_VERSION "${_hip_version_raw}")
    string(REPLACE "." ";" HIP_VERSION_PARTS "${HIP_SEMANTIC_VERSION}")
    list(LENGTH HIP_VERSION_PARTS NUM_HIP_VERSION_PARTS)

@@ -71,7 +77,13 @@ elseif(HIP_HIPCC_EXECUTABLE)

  # Construct full semantic version.
  set(HIP_VERSION "${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR}.${HIP_VERSION_PATCH}")
-  unset(HIP_VERSION_RAW)
+  unset(_hip_version_raw)
+  unset(_hipcc_executable)
 else()
  set(HIP_FOUND FALSE)
 endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(HIP
+    REQUIRED_VARS HIP_HIPCC_EXECUTABLE
+    VERSION_VAR HIP_VERSION)
--- a/build_files/cmake/Modules/FindSYCL.cmake
+++ b/build_files/cmake/Modules/FindSYCL.cmake
@@ -108,7 +108,11 @@ FIND_PACKAGE_HANDLE_STANDARD_ARGS(SYCL

 IF(SYCL_FOUND)
  SET(SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR} ${SYCL_INCLUDE_DIR}/sycl)
-  SET(SYCL_LIBRARIES ${SYCL_LIBRARY})
+  IF(WIN32 AND SYCL_LIBRARY_DEBUG)
+    SET(SYCL_LIBRARIES optimized ${SYCL_LIBRARY} debug ${SYCL_LIBRARY_DEBUG})
+  ELSE()
+    SET(SYCL_LIBRARIES ${SYCL_LIBRARY})
+  ENDIF()
 ELSE()
  SET(SYCL_SYCL_FOUND FALSE)
 ENDIF()
--- a/build_files/cmake/cmake_static_check_smatch.py
+++ b/build_files/cmake/cmake_static_check_smatch.py
@@ -1,58 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0-or-later
-
-CHECKER_IGNORE_PREFIX = [
-    "extern",
-    "intern/moto",
-]
-
-CHECKER_BIN = "smatch"
-CHECKER_ARGS = [
-    "--full-path",
-    "--two-passes",
-]
-
-import project_source_info
-import subprocess
-import sys
-import os
-
-USE_QUIET = (os.environ.get("QUIET", None) is not None)
-
-
-def main():
-    source_info = project_source_info.build_info(use_cxx=False, ignore_prefix_list=CHECKER_IGNORE_PREFIX)
-    source_defines = project_source_info.build_defines_as_args()
-
-    check_commands = []
-    for c, inc_dirs, defs in source_info:
-
-        cmd = ([CHECKER_BIN] +
-               CHECKER_ARGS +
-               [c] +
-               [("-I%s" % i) for i in inc_dirs] +
-               [("-D%s" % d) for d in defs] +
-               source_defines
-               )
-
-        check_commands.append((c, cmd))
-
-    def my_process(i, c, cmd):
-        if not USE_QUIET:
-            percent = 100.0 * (i / len(check_commands))
-            percent_str = "[" + ("%.2f]" % percent).rjust(7) + " %:"
-
-            sys.stdout.flush()
-            sys.stdout.write("%s %s\n" % (percent_str, c))
-
-        return subprocess.Popen(cmd)
-
-    process_functions = []
-    for i, (c, cmd) in enumerate(check_commands):
-        process_functions.append((my_process, (i, c, cmd)))
-
-    project_source_info.queue_processes(process_functions)
-
-
-if __name__ == "__main__":
-    main()
--- a/build_files/cmake/cmake_static_check_sparse.py
+++ b/build_files/cmake/cmake_static_check_sparse.py
@@ -1,56 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0-or-later
-
-CHECKER_IGNORE_PREFIX = [
-    "extern",
-    "intern/moto",
-]
-
-CHECKER_BIN = "sparse"
-CHECKER_ARGS = [
-]
-
-import project_source_info
-import subprocess
-import sys
-import os
-
-USE_QUIET = (os.environ.get("QUIET", None) is not None)
-
-
-def main():
-    source_info = project_source_info.build_info(use_cxx=False, ignore_prefix_list=CHECKER_IGNORE_PREFIX)
-    source_defines = project_source_info.build_defines_as_args()
-
-    check_commands = []
-    for c, inc_dirs, defs in source_info:
-
-        cmd = ([CHECKER_BIN] +
-               CHECKER_ARGS +
-               [c] +
-               [("-I%s" % i) for i in inc_dirs] +
-               [("-D%s" % d) for d in defs] +
-               source_defines
-               )
-
-        check_commands.append((c, cmd))
-
-    def my_process(i, c, cmd):
-        if not USE_QUIET:
-            percent = 100.0 * (i / len(check_commands))
-            percent_str = "[" + ("%.2f]" % percent).rjust(7) + " %:"
-
-            sys.stdout.flush()
-            sys.stdout.write("%s %s\n" % (percent_str, c))
-
-        return subprocess.Popen(cmd)
-
-    process_functions = []
-    for i, (c, cmd) in enumerate(check_commands):
-        process_functions.append((my_process, (i, c, cmd)))
-
-    project_source_info.queue_processes(process_functions)
-
-
-if __name__ == "__main__":
-    main()
--- a/build_files/cmake/cmake_static_check_splint.py
+++ b/build_files/cmake/cmake_static_check_splint.py
@@ -1,86 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-License-Identifier: GPL-2.0-or-later
-
-CHECKER_IGNORE_PREFIX = [
-    "extern",
-    "intern/moto",
-]
-
-CHECKER_BIN = "splint"
-
-CHECKER_ARGS = [
-    "-weak",
-    "-posix-lib",
-    "-linelen", "10000",
-    "+ignorequals",
-    "+relaxtypes",
-    "-retvalother",
-    "+matchanyintegral",
-    "+longintegral",
-    "+ignoresigns",
-    "-nestcomment",
-    "-predboolothers",
-    "-ifempty",
-    "-unrecogcomments",
-
-    # we may want to remove these later
-    "-type",
-    "-fixedformalarray",
-    "-fullinitblock",
-    "-fcnuse",
-    "-initallelements",
-    "-castfcnptr",
-    # -forcehints,
-    "-bufferoverflowhigh",  # warns a lot about sprintf()
-
-    # re-definitions, rna causes most of these
-    "-redef",
-    "-syntax",
-
-    # dummy, witjout this splint complains with:
-    #  /usr/include/bits/confname.h:31:27: *** Internal Bug at cscannerHelp.c:2428: Unexpanded macro not function or constant: int _PC_MAX_CANON
-    "-D_PC_MAX_CANON=0",
-]
-
-
-import project_source_info
-import subprocess
-import sys
-import os
-
-USE_QUIET = (os.environ.get("QUIET", None) is not None)
-
-
-def main():
-    source_info = project_source_info.build_info(use_cxx=False, ignore_prefix_list=CHECKER_IGNORE_PREFIX)
-
-    check_commands = []
-    for c, inc_dirs, defs in source_info:
-        cmd = ([CHECKER_BIN] +
-               CHECKER_ARGS +
-               [c] +
-               [("-I%s" % i) for i in inc_dirs] +
-               [("-D%s" % d) for d in defs]
-               )
-
-        check_commands.append((c, cmd))
-
-    def my_process(i, c, cmd):
-        if not USE_QUIET:
-            percent = 100.0 * (i / len(check_commands))
-            percent_str = "[" + ("%.2f]" % percent).rjust(7) + " %:"
-
-            sys.stdout.write("%s %s\n" % (percent_str, c))
-            sys.stdout.flush()
-
-        return subprocess.Popen(cmd)
-
-    process_functions = []
-    for i, (c, cmd) in enumerate(check_commands):
-        process_functions.append((my_process, (i, c, cmd)))
-
-    project_source_info.queue_processes(process_functions)
-
-
-if __name__ == "__main__":
-    main()
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -82,7 +82,7 @@ if(NOT APPLE)
  set(WITH_CYCLES_DEVICE_OPTIX    ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_CUDA_BINARIES   ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_CUBIN_COMPILER  OFF CACHE BOOL "" FORCE)
-  set(WITH_CYCLES_HIP_BINARIES    OFF CACHE BOOL "" FORCE)
+  set(WITH_CYCLES_HIP_BINARIES    ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_DEVICE_ONEAPI   ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_ONEAPI_BINARIES ON  CACHE BOOL "" FORCE)
 endif()
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -1384,4 +1384,3 @@ macro(windows_process_platform_bundled_libraries library_deps)
    endforeach()
  endif()
 endmacro()
-
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -174,7 +174,7 @@ if(SYSTEMSTUBS_LIBRARY)
  list(APPEND PLATFORM_LINKLIBS SystemStubs)
 endif()

-string(APPEND PLATFORM_CFLAGS " -pipe -funsigned-char -fno-strict-aliasing")
+string(APPEND PLATFORM_CFLAGS " -pipe -funsigned-char -fno-strict-aliasing -ffp-contract=off")
 set(PLATFORM_LINKFLAGS
  "-fexceptions -framework CoreServices -framework Foundation -framework IOKit -framework AppKit -framework Cocoa -framework Carbon -framework AudioUnit -framework AudioToolbox -framework CoreAudio -framework Metal -framework QuartzCore"
 )
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -803,8 +803,7 @@ if(CMAKE_COMPILER_IS_GNUCC)
  # Automatically turned on when building with "-march=native". This is
  # explicitly turned off here as it will make floating point math give a bit
  # different results. This will lead to automated test failures. So disable
-  # this until we support it. Seems to default to off in clang and the intel
-  # compiler.
+  # this until we support it.
  set(PLATFORM_CFLAGS "-pipe -fPIC -funsigned-char -fno-strict-aliasing -ffp-contract=off")

  # `maybe-uninitialized` is unreliable in release builds, but fine in debug builds.
@@ -815,64 +814,49 @@ if(CMAKE_COMPILER_IS_GNUCC)
  string(PREPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO "${GCC_EXTRA_FLAGS_RELEASE} ")
  unset(GCC_EXTRA_FLAGS_RELEASE)

-  # NOTE(@campbellbarton): Eventually mold will be able to use `-fuse-ld=mold`,
-  # however at the moment this only works for GCC 12.1+ (unreleased at time of writing).
-  # So a workaround is used here "-B" which points to another path to find system commands
-  # such as `ld`.
  if(WITH_LINKER_MOLD AND _IS_LINKER_DEFAULT)
    find_program(MOLD_BIN "mold")
    mark_as_advanced(MOLD_BIN)
+
    if(NOT MOLD_BIN)
      message(STATUS "The \"mold\" binary could not be found, using system linker.")
      set(WITH_LINKER_MOLD OFF)
+    elseif(CMAKE_C_COMPILER_VERSION VERSION_LESS 12.1)
+      message(STATUS "GCC 12.1 or newer is required for th MOLD linker.")
+      set(WITH_LINKER_MOLD OFF)
    else()
-      # By default mold installs the binary to:
-      # - `{PREFIX}/bin/mold` as well as a symbolic-link in...
-      # - `{PREFIX}/lib/mold/ld`.
-      # (where `PREFIX` is typically `/usr/`).
-      #
-      # This block of code finds `{PREFIX}/lib/mold` from the `mold` binary.
-      # Other methods of searching for the path could also be made to work,
-      # we could even make our own directory and symbolic-link, however it's more
-      # convenient to use the one provided by mold.
-      #
-      # Use the binary path to "mold", to find the common prefix which contains "lib/mold".
-      # The parent directory: e.g. `/usr/bin/mold` -> `/usr/bin/`.
-      get_filename_component(MOLD_PREFIX "${MOLD_BIN}" DIRECTORY)
-      # The common prefix path: e.g. `/usr/bin/` -> `/usr/` to use as a hint.
-      get_filename_component(MOLD_PREFIX "${MOLD_PREFIX}" DIRECTORY)
-      # Find `{PREFIX}/lib/mold/ld`, store the directory component (without the `ld`).
-      # Then pass `-B {PREFIX}/lib/mold` to GCC so the `ld` located there overrides the default.
-      find_path(
-        MOLD_BIN_DIR "ld"
-        HINTS "${MOLD_PREFIX}"
-        # The default path is `libexec`, Arch Linux for e.g.
-        # replaces this with `lib` so check both.
-        PATH_SUFFIXES "libexec/mold" "lib/mold" "lib64/mold"
-        NO_DEFAULT_PATH
-        NO_CACHE
+      get_filename_component(MOLD_BIN_DIR "${MOLD_BIN}" DIRECTORY)
+      # Check if the `-B` argument is required.
+      # This will happen when `MOLD_BIN` points to a non-standard location.
+      # Keep this option as mold is not yet a standard system component and
+      # users may have it installed in some unexpected place.
+      set(_mold_args "-fuse-ld=mold")
+      execute_process(
+        COMMAND ${CMAKE_C_COMPILER} -B ${MOLD_BIN_DIR} ${_mold_args} -Wl,--version
+        ERROR_QUIET OUTPUT_VARIABLE LD_VERSION_WITH_DIR
      )
-      if(NOT MOLD_BIN_DIR)
-        message(STATUS
-          "The mold linker could not find the directory containing the linker command "
-          "(typically "
-          "\"${MOLD_PREFIX}/libexec/mold/ld\") or "
-          "\"${MOLD_PREFIX}/lib/mold/ld\") using system linker."
-        )
-        set(WITH_LINKER_MOLD OFF)
+      execute_process(
+        COMMAND ${CMAKE_C_COMPILER} ${_mold_args} -Wl,--version
+        ERROR_QUIET OUTPUT_VARIABLE LD_VERSION
+      )
+      if(NOT (LD_VERSION STREQUAL LD_VERSION_WITH_DIR))
+        string(PREPEND _mold_args "-B \"${MOLD_BIN_DIR}\" ")
+        set(LD_VERSION "${LD_VERSION_WITH_DIR}")
      endif()
-      unset(MOLD_PREFIX)
-    endif()

-    if(WITH_LINKER_MOLD)
-      # GCC will search for `ld` in this directory first.
-      string(APPEND CMAKE_EXE_LINKER_FLAGS    " -B \"${MOLD_BIN_DIR}\"")
-      string(APPEND CMAKE_SHARED_LINKER_FLAGS " -B \"${MOLD_BIN_DIR}\"")
-      string(APPEND CMAKE_MODULE_LINKER_FLAGS " -B \"${MOLD_BIN_DIR}\"")
-      set(_IS_LINKER_DEFAULT OFF)
+      if("${LD_VERSION}" MATCHES "mold ")
+        string(APPEND CMAKE_EXE_LINKER_FLAGS    " ${_mold_args}")
+        string(APPEND CMAKE_SHARED_LINKER_FLAGS " ${_mold_args}")
+        string(APPEND CMAKE_MODULE_LINKER_FLAGS " ${_mold_args}")
+        set(_IS_LINKER_DEFAULT OFF)
+      else()
+        message(STATUS "GNU mold linker isn't available, using the default system linker.")
+      endif()
+      unset(_mold_args)
+      unset(MOLD_BIN_DIR)
+      unset(LD_VERSION)
    endif()
    unset(MOLD_BIN)
-    unset(MOLD_BIN_DIR)
  endif()

  if(WITH_LINKER_GOLD AND _IS_LINKER_DEFAULT)
@@ -907,7 +891,7 @@ if(CMAKE_COMPILER_IS_GNUCC)

 # CLang is the same as GCC for now.
 elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
-  set(PLATFORM_CFLAGS "-pipe -fPIC -funsigned-char -fno-strict-aliasing")
+  set(PLATFORM_CFLAGS "-pipe -fPIC -funsigned-char -fno-strict-aliasing -ffp-contract=off")

  if(WITH_LINKER_MOLD AND _IS_LINKER_DEFAULT)
    find_program(MOLD_BIN "mold")
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -9,7 +9,7 @@ buildbot:
    cuda11:
        version: '11.4.1'
    hip:
-        version: '5.3.22480'
+        version: '5.5.30571'
    optix:
        version: '7.3.0'
    ocloc:
--- a/doc/manpage/blender.1.py
+++ b/doc/manpage/blender.1.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0-or-later

-'''
+"""
 This script generates the blender.1 man page, embedding the help text
 from the Blender executable itself. Invoke it as follows:

@@ -9,7 +9,7 @@ from the Blender executable itself. Invoke it as follows:

 where <path-to-blender> is the path to the Blender executable,
 and <output-filename> is where to write the generated man page.
-'''
+"""

 import argparse
 import os
@@ -87,29 +87,29 @@ def man_page_from_blender_help(fh: TextIO, blender_bin: str, verbose: bool) -> N
        (blender_info["date"], blender_info["version"].replace(".", "\\&."))
    )

-    fh.write(r'''
+    fh.write(r"""
 .SH NAME
-blender \- a full-featured 3D application''')
+blender \- a full-featured 3D application""")

-    fh.write(r'''
+    fh.write(r"""
 .SH SYNOPSIS
-.B blender [args ...] [file] [args ...]''')
+.B blender [args ...] [file] [args ...]""")

-    fh.write(r'''
+    fh.write(r"""
 .br
 .SH DESCRIPTION
 .PP
 .B blender
-is a full-featured 3D application. It supports the entirety of the 3D pipeline - '''
-             '''modeling, rigging, animation, simulation, rendering, compositing, motion tracking, and video editing.
+is a full-featured 3D application. It supports the entirety of the 3D pipeline - """
+             """modeling, rigging, animation, simulation, rendering, compositing, motion tracking, and video editing.

-Use Blender to create 3D images and animations, films and commercials, content for games, '''
-             r'''architectural and industrial visualizations, and scientific visualizations.
+Use Blender to create 3D images and animations, films and commercials, content for games, """
+             r"""architectural and industrial visualizations, and scientific visualizations.

-https://www.blender.org''')
+https://www.blender.org""")

-    fh.write(r'''
-.SH OPTIONS''')
+    fh.write(r"""
+.SH OPTIONS""")

    fh.write("\n\n")

@@ -152,7 +152,7 @@ https://www.blender.org''')

    # Footer Content.

-    fh.write(r'''
+    fh.write(r"""
 .br
 .SH SEE ALSO
 .B luxrender(1)
@@ -162,7 +162,7 @@ https://www.blender.org''')
 This manpage was written for a Debian GNU/Linux system by Daniel Mester
 <mester@uni-bremen.de> and updated by Cyril Brulebois
 <cyril.brulebois@enst-bretagne.fr> and Dan Eicher <dan@trollwerks.org>.
-''')
+""")


 def create_argparse() -> argparse.ArgumentParser:
--- a/doc/python_api/rst/info_gotcha.rst
+++ b/doc/python_api/rst/info_gotcha.rst
@@ -865,29 +865,40 @@ Unfortunate Corner Cases
 Besides all expected cases listed above, there are a few others that should not be
 an issue but, due to internal implementation details, currently are:

- ``Object.hide_viewport``, ``Object.hide_select`` and ``Object.hide_render``:
-  Setting any of those Booleans will trigger a rebuild of Collection caches,
-  thus breaking any current iteration over ``Collection.all_objects``.
+
+Collection Objects
+^^^^^^^^^^^^^^^^^^
+
+Changing: ``Object.hide_viewport``, ``Object.hide_select`` or ``Object.hide_render``
+will trigger a rebuild of Collection caches, thus breaking any current iteration over ``Collection.all_objects``.
+
+ .. rubric:: Do not:
+
+ .. code-block:: python
+
+    # `all_objects` is an iterator. Using it directly while performing operations on its members that will update
+    # the memory accessed by the `all_objects` iterator will lead to invalid memory accesses and crashes.
+    for object in bpy.data.collections["Collection"].all_objects:
+         object.hide_viewport = True


-.. rubric:: Do not:
+ .. rubric:: Do:

-.. code-block:: python
+ .. code-block:: python

-   # `all_objects` is an iterator. Using it directly while performing operations on its members that will update
-   # the memory accessed by the `all_objects` iterator will lead to invalid memory accesses and crashes.
-   for object in bpy.data.collections["Collection"].all_objects:
-        object.hide_viewport = True
+    # `all_objects[:]` is an independent list generated from the iterator. As long as no objects are deleted,
+    # its content will remain valid even if the data accessed by the `all_objects` iterator is modified.
+    for object in bpy.data.collections["Collection"].all_objects[:]:
+         object.hide_viewport = True


-.. rubric:: Do:
+Data-Blocks Renaming During Iteration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-.. code-block:: python
-
-   # `all_objects[:]` is an independent list generated from the iterator. As long as no objects are deleted,
-   # its content will remain valid even if the data accessed by the `all_objects` iterator is modified.
-   for object in bpy.data.collections["Collection"].all_objects[:]:
-        object.hide_viewport = True
+Data-blocks accessed from ``bpy.data`` are sorted when their name is set.
+Any loop that iterates of a data such as ``bpy.data.objects`` for example,
+and sets the objects ``name`` must get all items from the iterator first (typically by converting to a list or tuple)
+to avoid missing some objects and iterating over others multiple times.


 sys.exit
--- a/extern/mantaflow/preprocessed/plugin/advection.cpp
+++ b/extern/mantaflow/preprocessed/plugin/advection.cpp
@@ -572,7 +572,7 @@ template<class T> inline bool cmpMinMax(T &minv, T &maxv, const T &val)
 }
 template<> inline bool cmpMinMax<Vec3>(Vec3 &minv, Vec3 &maxv, const Vec3 &val)
 {
-  return (cmpMinMax(minv.x, maxv.x, val.x) | cmpMinMax(minv.y, maxv.y, val.y) |
+  return (cmpMinMax(minv.x, maxv.x, val.x) || cmpMinMax(minv.y, maxv.y, val.y) ||
          cmpMinMax(minv.z, maxv.z, val.z));
 }

--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -281,6 +281,9 @@ endif()

 if(WITH_CYCLES_EMBREE)
  add_definitions(-DWITH_EMBREE)
+  if(WITH_CYCLES_DEVICE_ONEAPI AND EMBREE_SYCL_SUPPORT)
+    add_definitions(-DWITH_EMBREE_GPU)
+  endif()
  add_definitions(-DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION})
  include_directories(
    SYSTEM
--- a/intern/cycles/blender/addon/init.py
+++ b/intern/cycles/blender/addon/init.py
@@ -106,7 +106,7 @@ class CyclesRender(bpy.types.RenderEngine):
            from . import osl
            osl.update_script_node(node, self.report)
        else:
-            self.report({'ERROR'}, "OSL support disabled in this build.")
+            self.report({'ERROR'}, "OSL support disabled in this build")

    def update_render_passes(self, scene, srl):
        engine.register_passes(self, scene, srl)
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -172,6 +172,8 @@ def system_info():


 def list_render_passes(scene, srl):
+    import _cycles
+
    crl = srl.cycles

    # Combined pass.
@@ -250,6 +252,12 @@ def list_render_passes(scene, srl):
    for lightgroup in srl.lightgroups:
        yield ("Combined_%s" % lightgroup.name, "RGB", 'COLOR')

+    # Path guiding debug passes.
+    if _cycles.with_debug:
+        yield ("Guiding Color", "RGB", 'COLOR')
+        yield ("Guiding Probability", "X", 'VALUE')
+        yield ("Guiding Average Roughness", "X", 'VALUE')
+

 def register_passes(engine, scene, view_layer):
    for name, channelids, channeltype in list_render_passes(scene, view_layer):
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -1544,6 +1544,13 @@ class CyclesPreferences(bpy.types.AddonPreferences):
        default=False,
    )

+    use_oneapirt: BoolProperty(
+        name="Embree on GPU (Experimental)",
+        description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. "
+                    "However this support is experimental and some scenes may render incorrectly",
+        default=False,
+    )
+
    kernel_optimization_level: EnumProperty(
        name="Kernel Optimization",
        description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. "
@@ -1676,16 +1683,16 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                col.label(text=iface_("and NVIDIA driver version %s or newer") % driver_version,
                          icon='BLANK1', translate=False)
            elif device_type == 'HIP':
-                if True:
-                    col.label(text="HIP temporarily disabled due to compiler bugs", icon='BLANK1')
-                else:
-                    import sys
-                    if sys.platform[:3] == "win":
-                        driver_version = "21.Q4"
-                        col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
-                        col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
-                                  icon='BLANK1', translate=False)
-                    elif sys.platform.startswith("linux"):
+                import sys
+                if sys.platform[:3] == "win":
+                    driver_version = "21.Q4"
+                    col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
+                    col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
+                              icon='BLANK1', translate=False)
+                elif sys.platform.startswith("linux"):
+                    if True:
+                        col.label(text="HIP temporarily disabled due to compiler bugs", icon='BLANK1')
+                    else:
                        driver_version = "22.10"
                        col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
                        col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
@@ -1763,6 +1770,11 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                    col.prop(self, "kernel_optimization_level")
                col.prop(self, "use_metalrt")

+        if compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu:
+            row = layout.row()
+            row.use_property_split = True
+            row.prop(self, "use_oneapirt")
+
    def draw(self, context):
        self.draw_impl(self.layout, context)

--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -803,6 +803,16 @@ static void attr_create_generic(Scene *scene,
            num_curves, num_keys, data, element, [&](int i) { return float(src[i]); });
        break;
      }
+      case BL::Attribute::data_type_INT32_2D: {
+        BL::Int2Attribute b_int2_attribute{b_attribute};
+        const int2 *src = static_cast<const int2 *>(b_int2_attribute.data[0].ptr.data);
+        Attribute *attr = attributes.add(name, TypeFloat2, element);
+        float2 *data = attr->data_float2();
+        fill_generic_attribute(num_curves, num_keys, data, element, [&](int i) {
+          return make_float2(float(src[i][0]), float(src[i][1]));
+        });
+        break;
+      }
      case BL::Attribute::data_type_FLOAT_VECTOR: {
        BL::FloatVectorAttribute b_vector_attribute{b_attribute};
        const float(*src)[3] = static_cast<const float(*)[3]>(b_vector_attribute.data[0].ptr.data);
--- a/intern/cycles/blender/device.cpp
+++ b/intern/cycles/blender/device.cpp
@@ -112,9 +112,26 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences,
    device.has_peer_memory = false;
  }

-  if (get_boolean(cpreferences, "use_metalrt")) {
-    device.use_metalrt = true;
+  bool accumulated_use_hardware_raytracing = false;
+  foreach (
+      DeviceInfo &info,
+      (device.multi_devices.size() != 0 ? device.multi_devices : vector<DeviceInfo>({device}))) {
+    if (info.type == DEVICE_METAL && !get_boolean(cpreferences, "use_metalrt")) {
+      info.use_hardware_raytracing = false;
+    }
+
+    if (info.type == DEVICE_ONEAPI && !get_boolean(cpreferences, "use_oneapirt")) {
+      info.use_hardware_raytracing = false;
+    }
+
+    /* There is an accumulative logic here, because Multi-devices are support only for
+     * the same backend + CPU in Blender right now, and both oneAPI and Metal have a
+     * global boolean backend setting (see above) for enabling/disabling HW RT,
+     * so all sub-devices in the multi-device should enable (or disable) HW RT
+     * simultaneously (and CPU device are expected to ignore `use_hardware_raytracing` setting). */
+    accumulated_use_hardware_raytracing |= info.use_hardware_raytracing;
  }
+  device.use_hardware_raytracing = accumulated_use_hardware_raytracing;

  if (preview) {
    /* Disable specialization for preview renders. */
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -280,7 +280,7 @@ static void fill_generic_attribute(BL::Mesh &b_mesh,
        assert(0);
      }
      else {
-        const MEdge *edges = static_cast<const MEdge *>(b_mesh.edges[0].ptr.data);
+        const int2 *edges = static_cast<const int2 *>(b_mesh.edges[0].ptr.data);
        const size_t verts_num = b_mesh.vertices.length();
        vector<int> count(verts_num, 0);

@@ -288,11 +288,11 @@ static void fill_generic_attribute(BL::Mesh &b_mesh,
        for (int i = 0; i < edges_num; i++) {
          TypeInCycles value = get_value_at_index(i);

-          const MEdge &b_edge = edges[i];
-          data[b_edge.v1] += value;
-          data[b_edge.v2] += value;
-          count[b_edge.v1]++;
-          count[b_edge.v2]++;
+          const int2 &b_edge = edges[i];
+          data[b_edge[0]] += value;
+          data[b_edge[1]] += value;
+          count[b_edge[0]]++;
+          count[b_edge[1]]++;
        }

        for (size_t i = 0; i < verts_num; i++) {
@@ -528,6 +528,19 @@ static void attr_create_generic(Scene *scene,
        });
        break;
      }
+      case BL::Attribute::data_type_INT32_2D: {
+        BL::Int2Attribute b_int2_attribute{b_attribute};
+        if (b_int2_attribute.data.length() == 0) {
+          continue;
+        }
+        const int2 *src = static_cast<const int2 *>(b_int2_attribute.data[0].ptr.data);
+        Attribute *attr = attributes.add(name, TypeFloat2, element);
+        float2 *data = attr->data_float2();
+        fill_generic_attribute(b_mesh, data, b_domain, subdivision, [&](int i) {
+          return make_float2(float(src[i][0]), float(src[i][1]));
+        });
+        break;
+      }
      default:
        /* Not supported. */
        break;
@@ -783,13 +796,13 @@ static void attr_create_pointiness(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, b
  EdgeMap visited_edges;
  memset(&counter[0], 0, sizeof(int) * counter.size());

-  const MEdge *edges = static_cast<MEdge *>(b_mesh.edges[0].ptr.data);
+  const int2 *edges = static_cast<int2 *>(b_mesh.edges[0].ptr.data);
  const int edges_num = b_mesh.edges.length();

  for (int i = 0; i < edges_num; i++) {
-    const MEdge &b_edge = edges[i];
-    const int v0 = vert_orig_index[b_edge.v1];
-    const int v1 = vert_orig_index[b_edge.v2];
+    const int2 &b_edge = edges[i];
+    const int v0 = vert_orig_index[b_edge[0]];
+    const int v1 = vert_orig_index[b_edge[1]];
    if (visited_edges.exists(v0, v1)) {
      continue;
    }
@@ -825,9 +838,9 @@ static void attr_create_pointiness(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, b
  memset(&counter[0], 0, sizeof(int) * counter.size());
  visited_edges.clear();
  for (int i = 0; i < edges_num; i++) {
-    const MEdge &b_edge = edges[i];
-    const int v0 = vert_orig_index[b_edge.v1];
-    const int v1 = vert_orig_index[b_edge.v2];
+    const int2 &b_edge = edges[i];
+    const int v0 = vert_orig_index[b_edge[0]];
+    const int v1 = vert_orig_index[b_edge[1]];
    if (visited_edges.exists(v0, v1)) {
      continue;
    }
@@ -894,12 +907,12 @@ static void attr_create_random_per_island(Scene *scene,

  DisjointSet vertices_sets(number_of_vertices);

-  const MEdge *edges = static_cast<MEdge *>(b_mesh.edges[0].ptr.data);
+  const int2 *edges = static_cast<int2 *>(b_mesh.edges[0].ptr.data);
  const int edges_num = b_mesh.edges.length();
  const int *corner_verts = find_corner_vert_attribute(b_mesh);

  for (int i = 0; i < edges_num; i++) {
-    vertices_sets.join(edges[i].v1, edges[i].v2);
+    vertices_sets.join(edges[i][0], edges[i][1]);
  }

  AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
@@ -1221,12 +1234,12 @@ static void create_subd_mesh(Scene *scene,

    mesh->reserve_subd_creases(num_creases);

-    const MEdge *edges = static_cast<MEdge *>(b_mesh.edges[0].ptr.data);
+    const int2 *edges = static_cast<int2 *>(b_mesh.edges[0].ptr.data);
    for (int i = 0; i < edges_num; i++) {
      const float crease = creases[i];
      if (crease != 0.0f) {
-        const MEdge &b_edge = edges[i];
-        mesh->add_edge_crease(b_edge.v1, b_edge.v2, crease);
+        const int2 &b_edge = edges[i];
+        mesh->add_edge_crease(b_edge[0], b_edge[1], crease);
      }
    }
  }
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@@ -102,6 +102,16 @@ static void copy_attributes(PointCloud *pointcloud,
        }
        break;
      }
+      case BL::Attribute::data_type_INT32_2D: {
+        BL::Int2Attribute b_int2_attribute{b_attribute};
+        const int2 *src = static_cast<const int2 *>(b_int2_attribute.data[0].ptr.data);
+        Attribute *attr = attributes.add(name, TypeFloat2, element);
+        float2 *data = attr->data_float2();
+        for (int i = 0; i < num_points; i++) {
+          data[i] = make_float2(float(src[i][0]), float(src[i][1]));
+        }
+        break;
+      }
      case BL::Attribute::data_type_FLOAT_VECTOR: {
        BL::FloatVectorAttribute b_vector_attribute{b_attribute};
        const float(*src)[3] = static_cast<const float(*)[3]>(b_vector_attribute.data[0].ptr.data);
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -1034,6 +1034,14 @@ void *CCL_python_module_init()
  Py_INCREF(Py_False);
 #endif /* WITH_EMBREE */

+#ifdef WITH_EMBREE_GPU
+  PyModule_AddObject(mod, "with_embree_gpu", Py_True);
+  Py_INCREF(Py_True);
+#else  /* WITH_EMBREE_GPU */
+  PyModule_AddObject(mod, "with_embree_gpu", Py_False);
+  Py_INCREF(Py_False);
+#endif /* WITH_EMBREE_GPU */
+
  if (ccl::openimagedenoise_supported()) {
    PyModule_AddObject(mod, "with_openimagedenoise", Py_True);
    Py_INCREF(Py_True);
--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -1061,7 +1061,7 @@ void BlenderSession::ensure_display_driver_if_needed()
  unique_ptr<BlenderDisplayDriver> display_driver = make_unique<BlenderDisplayDriver>(
      b_engine, b_scene, background);
  display_driver_ = display_driver.get();
-  session->set_display_driver(move(display_driver));
+  session->set_display_driver(std::move(display_driver));
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -981,22 +981,8 @@ static ShaderNode *add_node(Scene *scene,
    sky->set_sun_disc(b_sky_node.sun_disc());
    sky->set_sun_size(b_sky_node.sun_size());
    sky->set_sun_intensity(b_sky_node.sun_intensity());
-    /* Patch sun position to be able to animate daylight cycle while keeping the shading code
-     * simple. */
-    float sun_rotation = b_sky_node.sun_rotation();
-    /* Wrap into [-2PI..2PI] range. */
-    float sun_elevation = fmodf(b_sky_node.sun_elevation(), M_2PI_F);
-    /* Wrap into [-PI..PI] range. */
-    if (fabsf(sun_elevation) >= M_PI_F) {
-      sun_elevation -= copysignf(2.0f, sun_elevation) * M_PI_F;
-    }
-    /* Wrap into [-PI/2..PI/2] range while keeping the same absolute position. */
-    if (sun_elevation >= M_PI_2_F || sun_elevation <= -M_PI_2_F) {
-      sun_elevation = copysignf(M_PI_F, sun_elevation) - sun_elevation;
-      sun_rotation += M_PI_F;
-    }
-    sky->set_sun_elevation(sun_elevation);
-    sky->set_sun_rotation(sun_rotation);
+    sky->set_sun_elevation(b_sky_node.sun_elevation());
+    sky->set_sun_rotation(b_sky_node.sun_rotation());
    sky->set_altitude(b_sky_node.altitude());
    sky->set_air_density(b_sky_node.air_density());
    sky->set_dust_density(b_sky_node.dust_density());
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -634,6 +634,10 @@ static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode
  MAP_PASS("AdaptiveAuxBuffer", PASS_ADAPTIVE_AUX_BUFFER, false);
  MAP_PASS("Debug Sample Count", PASS_SAMPLE_COUNT, false);

+  MAP_PASS("Guiding Color", PASS_GUIDING_COLOR, false);
+  MAP_PASS("Guiding Probability", PASS_GUIDING_PROBABILITY, false);
+  MAP_PASS("Guiding Average Roughness", PASS_GUIDING_AVG_ROUGHNESS, false);
+
  if (string_startswith(name, cryptomatte_prefix)) {
    type = PASS_CRYPTOMATTE;
    mode = PassMode::DENOISED;
@@ -684,18 +688,6 @@ void BlenderSync::sync_render_passes(BL::RenderLayer &b_rlay, BL::ViewLayer &b_v
  }
  scene->film->set_cryptomatte_passes(cryptomatte_passes);

-  /* Path guiding debug passes. */
-#ifdef WITH_CYCLES_DEBUG
-  b_engine.add_pass("Guiding Color", 3, "RGB", b_view_layer.name().c_str());
-  pass_add(scene, PASS_GUIDING_COLOR, "Guiding Color", PassMode::NOISY);
-
-  b_engine.add_pass("Guiding Probability", 1, "X", b_view_layer.name().c_str());
-  pass_add(scene, PASS_GUIDING_PROBABILITY, "Guiding Probability", PassMode::NOISY);
-
-  b_engine.add_pass("Guiding Average Roughness", 1, "X", b_view_layer.name().c_str());
-  pass_add(scene, PASS_GUIDING_AVG_ROUGHNESS, "Guiding Average Roughness", PassMode::NOISY);
-#endif
-
  unordered_set<string> expected_passes;

  /* Custom AOV passes. */
--- a/intern/cycles/bvh/build.cpp
+++ b/intern/cycles/bvh/build.cpp
@@ -527,7 +527,7 @@ BVHNode *BVHBuild::run()
    if (progress.get_cancel()) {
      rootnode->deleteSubtree();
      rootnode = NULL;
-      VLOG_WORK << "BVH build cancelled.";
+      VLOG_WORK << "BVH build canceled.";
    }
    else {
      /*rotate(rootnode, 4, 5);*/
--- a/intern/cycles/bvh/bvh2.cpp
+++ b/intern/cycles/bvh/bvh2.cpp
@@ -606,7 +606,7 @@ void BVH2::pack_instances(size_t nodes_size, size_t leaf_nodes_size)
      int4 *bvh_nodes = &bvh->pack.nodes[0];
      size_t bvh_nodes_size = bvh->pack.nodes.size();

-      for (size_t i = 0, j = 0; i < bvh_nodes_size; j++) {
+      for (size_t i = 0; i < bvh_nodes_size;) {
        size_t nsize, nsize_bbox;
        if (bvh_nodes[i].x & PATH_RAY_NODE_UNALIGNED) {
          nsize = BVH_UNALIGNED_NODE_SIZE;
--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -111,9 +111,13 @@ BVHEmbree::~BVHEmbree()
  }
 }

-void BVHEmbree::build(Progress &progress, Stats *stats, RTCDevice rtc_device_)
+void BVHEmbree::build(Progress &progress,
+                      Stats *stats,
+                      RTCDevice rtc_device_,
+                      const bool rtc_device_is_sycl_)
 {
  rtc_device = rtc_device_;
+  rtc_device_is_sycl = rtc_device_is_sycl_;
  assert(rtc_device);

  rtcSetDeviceErrorFunction(rtc_device, rtc_error_func, NULL);
@@ -266,15 +270,29 @@ void BVHEmbree::add_triangles(const Object *ob, const Mesh *mesh, int i)
  rtcSetGeometryTimeStepCount(geom_id, num_motion_steps);

  const int *triangles = mesh->get_triangles().data();
-  rtcSetSharedGeometryBuffer(geom_id,
-                             RTC_BUFFER_TYPE_INDEX,
-                             0,
-                             RTC_FORMAT_UINT3,
-                             triangles,
-                             0,
-                             sizeof(int) * 3,
-                             num_triangles);
-
+  if (!rtc_device_is_sycl) {
+    rtcSetSharedGeometryBuffer(geom_id,
+                               RTC_BUFFER_TYPE_INDEX,
+                               0,
+                               RTC_FORMAT_UINT3,
+                               triangles,
+                               0,
+                               sizeof(int) * 3,
+                               num_triangles);
+  }
+  else {
+    /* NOTE(sirgienko): If the Embree device is a SYCL device, then Embree execution will
+     * happen on GPU, and we cannot use standard host pointers at this point. So instead
+     * of making a shared geometry buffer - a new Embree buffer will be created and data
+     * will be copied. */
+    int *triangles_buffer = (int *)rtcSetNewGeometryBuffer(
+        geom_id, RTC_BUFFER_TYPE_INDEX, 0, RTC_FORMAT_UINT3, sizeof(int) * 3, num_triangles);
+    assert(triangles_buffer);
+    if (triangles_buffer) {
+      static_assert(sizeof(int) == sizeof(uint));
+      std::memcpy(triangles_buffer, triangles, sizeof(int) * 3 * (num_triangles));
+    }
+  }
  set_tri_vertex_buffer(geom_id, mesh, false);

  rtcSetGeometryUserData(geom_id, (void *)prim_offset);
@@ -323,14 +341,38 @@ void BVHEmbree::set_tri_vertex_buffer(RTCGeometry geom_id, const Mesh *mesh, con
      rtcUpdateGeometryBuffer(geom_id, RTC_BUFFER_TYPE_VERTEX, t);
    }
    else {
-      rtcSetSharedGeometryBuffer(geom_id,
-                                 RTC_BUFFER_TYPE_VERTEX,
-                                 t,
-                                 RTC_FORMAT_FLOAT3,
-                                 verts,
-                                 0,
-                                 sizeof(float3),
-                                 num_verts + 1);
+      if (!rtc_device_is_sycl) {
+        rtcSetSharedGeometryBuffer(geom_id,
+                                   RTC_BUFFER_TYPE_VERTEX,
+                                   t,
+                                   RTC_FORMAT_FLOAT3,
+                                   verts,
+                                   0,
+                                   sizeof(float3),
+                                   num_verts + 1);
+      }
+      else {
+        /* NOTE(sirgienko): If the Embree device is a SYCL device, then Embree execution will
+         * happen on GPU, and we cannot use standard host pointers at this point. So instead
+         * of making a shared geometry buffer - a new Embree buffer will be created and data
+         * will be copied. */
+        /* As float3 is packed on GPU side, we map it to packed_float3. */
+        packed_float3 *verts_buffer = (packed_float3 *)rtcSetNewGeometryBuffer(
+            geom_id,
+            RTC_BUFFER_TYPE_VERTEX,
+            t,
+            RTC_FORMAT_FLOAT3,
+            sizeof(packed_float3),
+            num_verts + 1);
+        assert(verts_buffer);
+        if (verts_buffer) {
+          for (size_t i = (size_t)0; i < num_verts + 1; ++i) {
+            verts_buffer[i].x = verts[i].x;
+            verts_buffer[i].y = verts[i].y;
+            verts_buffer[i].z = verts[i].z;
+          }
+        }
+      }
    }
  }
 }
--- a/intern/cycles/bvh/embree.h
+++ b/intern/cycles/bvh/embree.h
@@ -29,7 +29,10 @@ class PointCloud;

 class BVHEmbree : public BVH {
 public:
-  void build(Progress &progress, Stats *stats, RTCDevice rtc_device);
+  void build(Progress &progress,
+             Stats *stats,
+             RTCDevice rtc_device,
+             const bool isSyclEmbreeDevice = false);
  void refit(Progress &progress);

  RTCScene scene;
@@ -55,6 +58,7 @@ class BVHEmbree : public BVH {
                               const bool update);

  RTCDevice rtc_device;
+  bool rtc_device_is_sycl;
  enum RTCBuildQuality build_quality;
 };

--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -42,15 +42,19 @@ endif()
 ###########################################################################

 if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
-  set(WITH_CYCLES_HIP_BINARIES OFF)
-  message(STATUS "HIP temporarily disabled due to compiler bugs")
+  if(UNIX)
+    # Disabled until there is a HIP 5.5 release for Linux.
+    set(WITH_CYCLES_HIP_BINARIES OFF)
+    message(STATUS "HIP temporarily disabled due to compiler bugs")
+  else()
+    # Need at least HIP 5.5 to solve compiler bug affecting the kernel.
+    find_package(HIP 5.5.0)
+    set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)

-  # find_package(HIP)
-  # set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
-
-  # if(HIP_FOUND)
-  #   message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
-  # endif()
+    if(HIP_FOUND)
+      message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
+    endif()
+  endif()
 endif()

 if(NOT WITH_HIP_DYNLOAD)
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -84,7 +84,7 @@ CPUDevice::~CPUDevice()
  texture_info.free();
 }

-BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
+BVHLayoutMask CPUDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
 {
  BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
 #ifdef WITH_EMBREE
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -56,7 +56,7 @@ class CPUDevice : public Device {
  CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
  ~CPUDevice();

-  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+  virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;

  /* Returns true if the texture info was copied to the device (meaning, some more
   * re-initialization might be needed). */
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -35,7 +35,7 @@ bool CUDADevice::have_precompiled_kernels()
  return path_exists(cubins_path);
 }

-BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
+BVHLayoutMask CUDADevice::get_bvh_layout_mask(uint /*kernel_features*/) const
 {
  return BVH_LAYOUT_BVH2;
 }
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -38,7 +38,7 @@ class CUDADevice : public GPUDevice {

  static bool have_precompiled_kernels();

-  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+  virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;

  void set_error(const string &error) override;

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -354,7 +354,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
  info.has_guiding = true;
  info.has_profiling = true;
  info.has_peer_memory = false;
-  info.use_metalrt = false;
+  info.use_hardware_raytracing = false;
  info.denoisers = DENOISER_ALL;

  foreach (const DeviceInfo &device, subdevices) {
@@ -403,7 +403,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
    info.has_guiding &= device.has_guiding;
    info.has_profiling &= device.has_profiling;
    info.has_peer_memory |= device.has_peer_memory;
-    info.use_metalrt |= device.use_metalrt;
+    info.use_hardware_raytracing |= device.use_hardware_raytracing;
    info.denoisers &= device.denoisers;
  }

--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -71,15 +71,16 @@ class DeviceInfo {
  string description;
  string id; /* used for user preferences, should stay fixed with changing hardware config */
  int num;
-  bool display_device;  /* GPU is used as a display device. */
-  bool has_nanovdb;     /* Support NanoVDB volumes. */
-  bool has_light_tree;  /* Support light tree. */
-  bool has_osl;         /* Support Open Shading Language. */
-  bool has_guiding;     /* Support path guiding. */
-  bool has_profiling;   /* Supports runtime collection of profiling info. */
-  bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
-  bool has_gpu_queue;   /* Device supports GPU queue. */
-  bool use_metalrt;     /* Use MetalRT to accelerate ray queries (Metal only). */
+  bool display_device;          /* GPU is used as a display device. */
+  bool has_nanovdb;             /* Support NanoVDB volumes. */
+  bool has_light_tree;          /* Support light tree. */
+  bool has_osl;                 /* Support Open Shading Language. */
+  bool has_guiding;             /* Support path guiding. */
+  bool has_profiling;           /* Supports runtime collection of profiling info. */
+  bool has_peer_memory;         /* GPU has P2P access to memory of another GPU. */
+  bool has_gpu_queue;           /* Device supports GPU queue. */
+  bool use_hardware_raytracing; /* Use hardware ray tracing to accelerate ray queries in a backend.
+                                 */
  KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
                                                      * kernels (Metal only). */
  DenoiserTypeMask denoisers;                        /* Supported denoiser types. */
@@ -101,7 +102,7 @@ class DeviceInfo {
    has_profiling = false;
    has_peer_memory = false;
    has_gpu_queue = false;
-    use_metalrt = false;
+    use_hardware_raytracing = false;
    denoisers = DENOISER_NONE;
  }

@@ -157,7 +158,7 @@ class Device {
    fprintf(stderr, "%s\n", error.c_str());
    fflush(stderr);
  }
-  virtual BVHLayoutMask get_bvh_layout_mask() const = 0;
+  virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const = 0;

  /* statistics */
  Stats &stats;
--- a/intern/cycles/device/dummy/device.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -20,7 +20,7 @@ class DummyDevice : public Device {

  ~DummyDevice() {}

-  virtual BVHLayoutMask get_bvh_layout_mask() const override
+  virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override
  {
    return 0;
  }
--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -137,7 +137,7 @@ void device_hip_info(vector<DeviceInfo> &devices)
    info.num = num;

    info.has_nanovdb = true;
-    info.has_light_tree = false;
+    info.has_light_tree = true;
    info.denoisers = 0;

    info.has_gpu_queue = true;
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -35,7 +35,7 @@ bool HIPDevice::have_precompiled_kernels()
  return path_exists(fatbins_path);
 }

-BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
+BVHLayoutMask HIPDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
 {
  return BVH_LAYOUT_BVH2;
 }
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -35,7 +35,7 @@ class HIPDevice : public GPUDevice {

  static bool have_precompiled_kernels();

-  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+  virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;

  void set_error(const string &error) override;

--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -3,7 +3,9 @@

 #include "device/kernel.h"

-#include "util/log.h"
+#ifndef __KERNEL_ONEAPI__
+#  include "util/log.h"
+#endif

 CCL_NAMESPACE_BEGIN

@@ -153,10 +155,13 @@ const char *device_kernel_as_string(DeviceKernel kernel)
    case DEVICE_KERNEL_NUM:
      break;
  };
+#ifndef __KERNEL_ONEAPI__
  LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
+#endif
  return "UNKNOWN";
 }

+#ifndef __KERNEL_ONEAPI__
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
 {
  os << device_kernel_as_string(kernel);
@@ -178,5 +183,6 @@ string device_kernel_mask_as_string(DeviceKernelMask mask)

  return str;
 }
+#endif

 CCL_NAMESPACE_END
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -3,11 +3,13 @@

 #pragma once

-#include "kernel/types.h"
+#ifndef __KERNEL_ONEAPI__
+#  include "kernel/types.h"

-#include "util/string.h"
+#  include "util/string.h"

-#include <ostream>  // NOLINT
+#  include <ostream>  // NOLINT
+#endif

 CCL_NAMESPACE_BEGIN

@@ -15,9 +17,12 @@ bool device_kernel_has_shading(DeviceKernel kernel);
 bool device_kernel_has_intersection(DeviceKernel kernel);

 const char *device_kernel_as_string(DeviceKernel kernel);
+
+#ifndef __KERNEL_ONEAPI__
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);

 typedef uint64_t DeviceKernelMask;
 string device_kernel_mask_as_string(DeviceKernelMask mask);
+#endif

 CCL_NAMESPACE_END
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -100,7 +100,7 @@ class MetalDevice : public Device {

  virtual void cancel() override;

-  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+  virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;

  void set_error(const string &error) override;

--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -39,7 +39,7 @@ bool MetalDevice::is_device_cancelled(int ID)
  return get_device_by_ID(ID, lock) == nullptr;
 }

-BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
+BVHLayoutMask MetalDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
 {
  return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
 }
@@ -100,12 +100,12 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    }
    case METAL_GPU_AMD: {
      max_threads_per_threadgroup = 128;
-      use_metalrt = info.use_metalrt;
+      use_metalrt = info.use_hardware_raytracing;
      break;
    }
    case METAL_GPU_APPLE: {
      max_threads_per_threadgroup = 512;
-      use_metalrt = info.use_metalrt;
+      use_metalrt = info.use_hardware_raytracing;
      break;
    }
  }
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -96,12 +96,13 @@ class MultiDevice : public Device {
    return error_msg;
  }

-  virtual BVHLayoutMask get_bvh_layout_mask() const override
+  virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override
  {
    BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
    BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
    foreach (const SubDevice &sub_device, devices) {
-      BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
+      BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask(
+          kernel_features);
      bvh_layout_mask &= device_bvh_layout_mask;
      bvh_layout_mask_all |= device_bvh_layout_mask;
    }
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -40,12 +40,12 @@ bool device_oneapi_init()
  if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) {
    _putenv_s("SYCL_CACHE_THRESHOLD", "0");
  }
-  if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
+  if (getenv("ONEAPI_DEVICE_SELECTOR") == nullptr) {
    if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
-      _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
+      _putenv_s("ONEAPI_DEVICE_SELECTOR", "level_zero:*");
    }
    else {
-      _putenv_s("SYCL_DEVICE_FILTER", "level_zero,cuda,hip");
+      _putenv_s("ONEAPI_DEVICE_SELECTOR", "!opencl:*");
    }
  }
  if (getenv("SYCL_ENABLE_PCI") == nullptr) {
@@ -58,10 +58,10 @@ bool device_oneapi_init()
  setenv("SYCL_CACHE_PERSISTENT", "1", false);
  setenv("SYCL_CACHE_THRESHOLD", "0", false);
  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
-    setenv("SYCL_DEVICE_FILTER", "level_zero", false);
+    setenv("ONEAPI_DEVICE_SELECTOR", "level_zero:*", false);
  }
  else {
-    setenv("SYCL_DEVICE_FILTER", "level_zero,cuda,hip", false);
+    setenv("ONEAPI_DEVICE_SELECTOR", "!opencl:*", false);
  }
  setenv("SYCL_ENABLE_PCI", "1", false);
  setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
@@ -87,7 +87,8 @@ Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &pro
 }

 #ifdef WITH_ONEAPI
-static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr)
+static void device_iterator_cb(
+    const char *id, const char *name, int num, bool hwrt_support, void *user_ptr)
 {
  vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;

@@ -112,6 +113,13 @@ static void device_iterator_cb(const char *id, const char *name, int num, void *
  /* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */
  info.display_device = false;

+#  ifdef WITH_EMBREE_GPU
+  info.use_hardware_raytracing = hwrt_support;
+#  else
+  info.use_hardware_raytracing = false;
+  (void)hwrt_support;
+#  endif
+
  devices->push_back(info);
  VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
 }
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -8,7 +8,19 @@
 #  include "util/debug.h"
 #  include "util/log.h"

+#  ifdef WITH_EMBREE_GPU
+#    include "bvh/embree.h"
+#  endif
+
 #  include "kernel/device/oneapi/globals.h"
+#  include "kernel/device/oneapi/kernel.h"
+
+#  if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
+/* These declarations are missing from embree headers when compiling from a compiler that doesn't
+ * support SYCL. */
+extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
+extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
+#  endif

 CCL_NAMESPACE_BEGIN

@@ -22,16 +34,29 @@ static void queue_error_cb(const char *message, void *user_ptr)
 OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
    : Device(info, stats, profiler),
      device_queue_(nullptr),
+#  ifdef WITH_EMBREE_GPU
+      embree_device(nullptr),
+      embree_scene(nullptr),
+#  endif
      texture_info_(this, "texture_info", MEM_GLOBAL),
      kg_memory_(nullptr),
      kg_memory_device_(nullptr),
      kg_memory_size_(0)
 {
  need_texture_info_ = false;
+  use_hardware_raytracing = info.use_hardware_raytracing;

  oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);

-  bool is_finished_ok = create_queue(device_queue_, info.num);
+  bool is_finished_ok = create_queue(device_queue_,
+                                     info.num,
+#  ifdef WITH_EMBREE_GPU
+                                     use_hardware_raytracing ? &embree_device : nullptr
+#  else
+                                     nullptr
+#  endif
+  );
+
  if (is_finished_ok == false) {
    set_error("oneAPI queue initialization error: got runtime exception \"" +
              oneapi_error_string_ + "\"");
@@ -42,6 +67,16 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
    assert(device_queue_);
  }

+#  ifdef WITH_EMBREE_GPU
+  use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
+#  else
+  use_hardware_raytracing = false;
+#  endif
+
+  if (use_hardware_raytracing) {
+    VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
+  }
+
  size_t globals_segment_size;
  is_finished_ok = kernel_globals_size(globals_segment_size);
  if (is_finished_ok == false) {
@@ -64,6 +99,11 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi

 OneapiDevice::~OneapiDevice()
 {
+#  ifdef WITH_EMBREE_GPU
+  if (embree_device)
+    rtcReleaseDevice(embree_device);
+#  endif
+
  texture_info_.free();
  usm_free(device_queue_, kg_memory_);
  usm_free(device_queue_, kg_memory_device_);
@@ -80,15 +120,47 @@ bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
  return false;
 }

-BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
+bool OneapiDevice::can_use_hardware_raytracing_for_features(uint requested_features) const
 {
-  return BVH_LAYOUT_BVH2;
+  /* MNEE and Ray-trace kernels currently don't work correctly with HWRT. */
+  return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE));
 }

+BVHLayoutMask OneapiDevice::get_bvh_layout_mask(uint requested_features) const
+{
+  return (use_hardware_raytracing &&
+          can_use_hardware_raytracing_for_features(requested_features)) ?
+             BVH_LAYOUT_EMBREE :
+             BVH_LAYOUT_BVH2;
+}
+
+#  ifdef WITH_EMBREE_GPU
+void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
+{
+  if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
+    BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
+    if (refit) {
+      bvh_embree->refit(progress);
+    }
+    else {
+      bvh_embree->build(progress, &stats, embree_device, true);
+    }
+    if (bvh->params.top_level) {
+      embree_scene = bvh_embree->scene;
+    }
+  }
+  else {
+    Device::build_bvh(bvh, progress, refit);
+  }
+}
+#  endif
+
 bool OneapiDevice::load_kernels(const uint requested_features)
 {
  assert(device_queue_);

+  kernel_features = requested_features;
+
  bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
  if (is_finished_ok == false) {
    set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
@@ -100,7 +172,14 @@ bool OneapiDevice::load_kernels(const uint requested_features)
    assert(device_queue_);
  }

-  is_finished_ok = oneapi_load_kernels(device_queue_, (const unsigned int)requested_features);
+  if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
+    VLOG_INFO
+        << "Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
+    use_hardware_raytracing = false;
+  }
+
+  is_finished_ok = oneapi_load_kernels(
+      device_queue_, (const unsigned int)requested_features, use_hardware_raytracing);
  if (is_finished_ok == false) {
    set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
  }
@@ -327,6 +406,16 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
             << string_human_readable_number(size) << " bytes. ("
             << string_human_readable_size(size) << ")";

+#  ifdef WITH_EMBREE_GPU
+  if (strcmp(name, "data") == 0) {
+    assert(size <= sizeof(KernelData));
+
+    /* Update scene handle(since it is different for each device on multi devices) */
+    KernelData *const data = (KernelData *)host;
+    data->device_bvh = embree_scene;
+  }
+#  endif
+
  ConstMemMap::iterator i = const_mem_map_.find(name);
  device_vector<uchar> *data;

@@ -446,7 +535,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
 #  endif
 }

-bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
+bool OneapiDevice::create_queue(SyclQueue *&external_queue,
+                                int device_index,
+                                void *embree_device_pointer)
 {
  bool finished_correct = true;
  try {
@@ -457,6 +548,13 @@ bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
    sycl::queue *created_queue = new sycl::queue(devices[device_index],
                                                 sycl::property::queue::in_order());
    external_queue = reinterpret_cast<SyclQueue *>(created_queue);
+#  ifdef WITH_EMBREE_GPU
+    if (embree_device_pointer) {
+      *((RTCDevice *)embree_device_pointer) = rtcNewSYCLDevice(created_queue->get_context(), "");
+    }
+#  else
+    (void)embree_device_pointer;
+#  endif
  }
  catch (sycl::exception const &e) {
    finished_correct = false;
@@ -625,7 +723,8 @@ bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
                                  size_t global_size,
                                  void **args)
 {
-  return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
+  return oneapi_enqueue_kernel(
+      kernel_context, kernel, global_size, kernel_features, use_hardware_raytracing, args);
 }

 /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
@@ -767,9 +866,9 @@ char *OneapiDevice::device_capabilities()

    sycl::id<3> max_work_item_sizes =
        device.get_info<sycl::info::device::max_work_item_sizes<3>>();
-    WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
-    WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
-    WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
+    WRITE_ATTR(max_work_item_sizes_dim0, ((size_t)max_work_item_sizes.get(0)))
+    WRITE_ATTR(max_work_item_sizes_dim1, ((size_t)max_work_item_sizes.get(1)))
+    WRITE_ATTR(max_work_item_sizes_dim2, ((size_t)max_work_item_sizes.get(2)))

    GET_NUM_ATTR(max_work_group_size)
    GET_NUM_ATTR(max_num_sub_groups)
@@ -792,7 +891,7 @@ char *OneapiDevice::device_capabilities()
    GET_NUM_ATTR(native_vector_width_half)

    size_t max_clock_frequency = device.get_info<sycl::info::device::max_clock_frequency>();
-    WRITE_ATTR("max_clock_frequency", max_clock_frequency)
+    WRITE_ATTR(max_clock_frequency, max_clock_frequency)

    GET_NUM_ATTR(address_bits)
    GET_NUM_ATTR(max_mem_alloc_size)
@@ -801,7 +900,7 @@ char *OneapiDevice::device_capabilities()
     * supported so we always return false, even if device supports HW texture usage acceleration.
     */
    bool image_support = false;
-    WRITE_ATTR("image_support", (size_t)image_support)
+    WRITE_ATTR(image_support, (size_t)image_support)

    GET_NUM_ATTR(max_parameter_size)
    GET_NUM_ATTR(mem_base_addr_align)
@@ -830,12 +929,17 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
    std::string name = device.get_info<sycl::info::device::name>();
 #  else
    std::string name = "SYCL Host Task (Debug)";
+#  endif
+#  ifdef WITH_EMBREE_GPU
+    bool hwrt_support = rtcIsSYCLDeviceSupported(device);
+#  else
+    bool hwrt_support = false;
 #  endif
    std::string id = "ONEAPI_" + platform_name + "_" + name;
    if (device.has(sycl::aspect::ext_intel_pci_address)) {
      id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
    }
-    (cb)(id.c_str(), name.c_str(), num, user_ptr);
+    (cb)(id.c_str(), name.c_str(), num, hwrt_support, user_ptr);
    num++;
  }
 }
--- a/intern/cycles/device/oneapi/device_impl.h
+++ b/intern/cycles/device/oneapi/device_impl.h
@@ -16,15 +16,16 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
-                                             const char *name,
-                                             int num,
-                                             void *user_ptr);
+typedef void (*OneAPIDeviceIteratorCallback)(
+    const char *id, const char *name, int num, bool hwrt_support, void *user_ptr);

 class OneapiDevice : public Device {
 private:
  SyclQueue *device_queue_;
-
+#  ifdef WITH_EMBREE_GPU
+  RTCDevice embree_device;
+  RTCScene embree_scene;
+#  endif
  using ConstMemMap = map<string, device_vector<uchar> *>;
  ConstMemMap const_mem_map_;
  device_vector<TextureInfo> texture_info_;
@@ -34,17 +35,21 @@ class OneapiDevice : public Device {
  size_t kg_memory_size_ = (size_t)0;
  size_t max_memory_on_device_ = (size_t)0;
  std::string oneapi_error_string_;
+  bool use_hardware_raytracing = false;
+  unsigned int kernel_features = 0;

 public:
-  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+  virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;

  OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);

  virtual ~OneapiDevice();
-
+#  ifdef WITH_EMBREE_GPU
+  void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
+#  endif
  bool check_peer_access(Device *peer_device) override;

-  bool load_kernels(const uint requested_features) override;
+  bool load_kernels(const uint kernel_features) override;

  void load_texture_info();

@@ -113,8 +118,9 @@ class OneapiDevice : public Device {
  SyclQueue *sycl_queue();

 protected:
+  bool can_use_hardware_raytracing_for_features(uint kernel_features) const;
  void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
-  bool create_queue(SyclQueue *&external_queue, int device_index);
+  bool create_queue(SyclQueue *&external_queue, int device_index, void *embree_device);
  void free_queue(SyclQueue *queue);
  void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment);
  void *usm_alloc_device(SyclQueue *queue, size_t memory_size);
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -151,7 +151,7 @@ unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
  return make_unique<OptiXDeviceQueue>(this);
 }

-BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
+BVHLayoutMask OptiXDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
 {
  /* OptiX has its own internal acceleration structure format. */
  return BVH_LAYOUT_OPTIX;
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -88,7 +88,7 @@ class OptiXDevice : public CUDADevice {
  OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
  ~OptiXDevice();

-  BVHLayoutMask get_bvh_layout_mask() const override;
+  BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;

  string compile_kernel_get_common_cflags(const uint kernel_features);

--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -574,7 +574,7 @@ void PathTrace::denoise(const RenderWork &render_work)

 void PathTrace::set_output_driver(unique_ptr<OutputDriver> driver)
 {
-  output_driver_ = move(driver);
+  output_driver_ = std::move(driver);
 }

 void PathTrace::set_display_driver(unique_ptr<DisplayDriver> driver)
@@ -585,7 +585,7 @@ void PathTrace::set_display_driver(unique_ptr<DisplayDriver> driver)
  destroy_gpu_resources();

  if (driver) {
-    display_ = make_unique<PathTraceDisplay>(move(driver));
+    display_ = make_unique<PathTraceDisplay>(std::move(driver));
  }
  else {
    display_ = nullptr;
--- a/intern/cycles/integrator/path_trace_display.cpp
+++ b/intern/cycles/integrator/path_trace_display.cpp
@@ -9,7 +9,9 @@

 CCL_NAMESPACE_BEGIN

-PathTraceDisplay::PathTraceDisplay(unique_ptr<DisplayDriver> driver) : driver_(move(driver)) {}
+PathTraceDisplay::PathTraceDisplay(unique_ptr<DisplayDriver> driver) : driver_(std::move(driver))
+{
+}

 void PathTraceDisplay::reset(const BufferParams &buffer_params, const bool reset_rendering)
 {
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -357,8 +357,12 @@ void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(
 #  if PATH_GUIDING_LEVEL >= 2
  const bool use_direct_light = kernel_data.integrator.use_guiding_direct_light;
  const bool use_mis_weights = kernel_data.integrator.use_guiding_mis_weights;
+#    if OPENPGL_VERSION_MINOR >= 5
+  kg->opgl_path_segment_storage->PrepareSamples(use_mis_weights, use_direct_light, false);
+#    else
  kg->opgl_path_segment_storage->PrepareSamples(
      false, nullptr, use_mis_weights, use_direct_light, false);
+#    endif
 #  endif

 #  ifdef WITH_CYCLES_DEBUG
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -28,6 +28,7 @@ static size_t estimate_single_state_size(const uint kernel_features)
 #define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  state_size += (kernel_features & (feature)) ? sizeof(type) : 0;
 #define KERNEL_STRUCT_END(name) \
+  (void)array_index; \
  break; \
  }
 #define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
@@ -139,6 +140,7 @@ void PathTraceWorkGPU::alloc_integrator_soa()
    integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
  }
 #define KERNEL_STRUCT_END(name) \
+  (void)array_index; \
  break; \
  }
 #define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
@@ -299,8 +301,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
   * become busy after adding new tiles). This is especially important for the shadow catcher which
   * schedules work in halves of available number of paths. */
  work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
-  work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
-                                          0);
+  work_tile_scheduler_.set_accelerated_rt(
+      (device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
  work_tile_scheduler_.reset(effective_buffer_params_,
                             start_sample,
                             samples_num,
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -55,21 +55,29 @@ void WorkTileScheduler::reset_scheduler_state()

  VLOG_WORK << "Will schedule tiles of size " << tile_size_;

-  if (VLOG_IS_ON(3)) {
-    /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling
-     * and purely focusing on the number of used path states. */
-    const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
-                                        tile_size_.num_samples;
-    const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
-    VLOG_WORK << "Number of unused path states: "
-              << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+  const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+                                      tile_size_.num_samples;
+
+  if (num_path_states_in_tile == 0) {
+    num_tiles_x_ = 0;
+    num_tiles_y_ = 0;
+    num_tiles_per_sample_range_ = 0;
+  }
+  else {
+    if (VLOG_IS_ON(3)) {
+      /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile
+       * scheduling and purely focusing on the number of used path states. */
+      const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+      VLOG_WORK << "Number of unused path states: "
+                << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+    }
+
+    num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+    num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+    num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
  }

-  num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
-  num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
-
  total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
-  num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);

  next_work_index_ = 0;
  total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -96,10 +96,13 @@ set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS
  device/oneapi/compat.h
  device/oneapi/context_begin.h
  device/oneapi/context_end.h
+  device/oneapi/context_intersect_begin.h
+  device/oneapi/context_intersect_end.h
  device/oneapi/globals.h
  device/oneapi/image.h
  device/oneapi/kernel.h
  device/oneapi/kernel_templates.h
+  device/cpu/bvh.h
 )

 set(SRC_KERNEL_CLOSURE_HEADERS
@@ -764,7 +767,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)

  # Set defaults for spir64 and spir64_gen options
  if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
-    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
+    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-regular-grf-kernel integrator_intersect -ze-opt-large-grf-kernel shade -ze-opt-no-local-to-generic'")
  endif()
  if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen)
    set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target")
@@ -775,8 +778,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)

  # Host execution won't use GPU binaries, no need to compile them.
  if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
-    # AoT binaries aren't currently reused when calling sycl::build.
-    list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
    # Iterate over all targest and their options
    list(JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string)
    list(APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
@@ -798,6 +799,59 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
      -I"${NANOVDB_INCLUDE_DIR}")
  endif()

+  if(WITH_CYCLES_EMBREE AND EMBREE_SYCL_SUPPORT)
+    list(APPEND sycl_compiler_flags
+      -DWITH_EMBREE
+      -DWITH_EMBREE_GPU
+      -DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION}
+      -I"${EMBREE_INCLUDE_DIRS}")
+
+    if(WIN32)
+      list(APPEND sycl_compiler_flags
+        -ladvapi32.lib
+      )
+    endif()
+
+    set(next_library_mode "")
+    foreach(library ${EMBREE_LIBRARIES})
+      string(TOLOWER "${library}" library_lower)
+      if(("${library_lower}" STREQUAL "optimized") OR
+         ("${library_lower}" STREQUAL "debug"))
+        set(next_library_mode "${library_lower}")
+      else()
+        if(next_library_mode STREQUAL "")
+          list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library})
+          list(APPEND EMBREE_TBB_LIBRARIES_debug ${library})
+        else()
+          list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library})
+        endif()
+        set(next_library_mode "")
+      endif()
+    endforeach()
+
+    foreach(library ${TBB_LIBRARIES})
+      string(TOLOWER "${library}" library_lower)
+      if(("${library_lower}" STREQUAL "optimized") OR
+         ("${library_lower}" STREQUAL "debug"))
+        set(next_library_mode "${library_lower}")
+      else()
+        if(next_library_mode STREQUAL "")
+          list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library})
+          list(APPEND EMBREE_TBB_LIBRARIES_debug ${library})
+        else()
+          list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library})
+        endif()
+        set(next_library_mode "")
+      endif()
+    endforeach()
+      list(APPEND sycl_compiler_flags
+        "$<$<CONFIG:Release>:${EMBREE_TBB_LIBRARIES_optimized}>"
+        "$<$<CONFIG:RelWithDebInfo>:${EMBREE_TBB_LIBRARIES_optimized}>"
+        "$<$<CONFIG:MinSizeRel>:${EMBREE_TBB_LIBRARIES_optimized}>"
+        "$<$<CONFIG:Debug>:${EMBREE_TBB_LIBRARIES_debug}>"
+      )
+  endif()
+
  if(WITH_CYCLES_DEBUG)
    list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG)
  endif()
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -21,6 +21,28 @@
 #  define __BVH2__
 #endif

+#if defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU)
+/* bool is apparently not tested for specialization constants:
+ * https://github.com/intel/llvm/blob/39d1c65272a786b2b13a6f094facfddf9408406d/sycl/test/basic_tests/SYCL-2020-spec-constants.cpp#L25-L27
+ * Instead of adding one more bool specialization constant, we reuse existing embree_features one
+ * and use RTC_FEATURE_FLAG_NONE as value to test for avoiding to call Embree on GPU.
+ */
+/* We set it to RTC_FEATURE_FLAG_NONE by default so AoT binaries contain MNE and ray-trace kernels
+ * pre-compiled without Embree.
+ * Changing this default value would require updating the logic in oneapi_load_kernels(). */
+static constexpr sycl::specialization_id<RTCFeatureFlags> oneapi_embree_features{
+    RTC_FEATURE_FLAG_NONE};
+#  define IF_USING_EMBREE \
+    if (kernel_handler.get_specialization_constant<oneapi_embree_features>() != \
+        RTC_FEATURE_FLAG_NONE)
+#  define IF_NOT_USING_EMBREE \
+    if (kernel_handler.get_specialization_constant<oneapi_embree_features>() == \
+        RTC_FEATURE_FLAG_NONE)
+#else
+#  define IF_USING_EMBREE
+#  define IF_NOT_USING_EMBREE
+#endif
+
 CCL_NAMESPACE_BEGIN

 #ifdef __BVH2__
@@ -74,30 +96,39 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
  }

 #  ifdef __EMBREE__
-  if (kernel_data.device_bvh) {
-    return kernel_embree_intersect(kg, ray, visibility, isect);
+  IF_USING_EMBREE
+  {
+    if (kernel_data.device_bvh) {
+      return kernel_embree_intersect(kg, ray, visibility, isect);
+    }
  }
 #  endif

+  IF_NOT_USING_EMBREE
+  {
 #  ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
+    if (kernel_data.bvh.have_motion) {
 #    ifdef __HAIR__
-    if (kernel_data.bvh.have_curves) {
-      return bvh_intersect_hair_motion(kg, ray, isect, visibility);
-    }
+      if (kernel_data.bvh.have_curves) {
+        return bvh_intersect_hair_motion(kg, ray, isect, visibility);
+      }
 #    endif /* __HAIR__ */

-    return bvh_intersect_motion(kg, ray, isect, visibility);
-  }
+      return bvh_intersect_motion(kg, ray, isect, visibility);
+    }
 #  endif /* __OBJECT_MOTION__ */

 #  ifdef __HAIR__
-  if (kernel_data.bvh.have_curves) {
-    return bvh_intersect_hair(kg, ray, isect, visibility);
-  }
+    if (kernel_data.bvh.have_curves) {
+      return bvh_intersect_hair(kg, ray, isect, visibility);
+    }
 #  endif /* __HAIR__ */

-  return bvh_intersect(kg, ray, isect, visibility);
+    return bvh_intersect(kg, ray, isect, visibility);
+  }
+
+  kernel_assert(false);
+  return false;
 }

 /* Single object BVH traversal, for SSS/AO/bevel. */
@@ -129,17 +160,27 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
  }

 #    ifdef __EMBREE__
-  if (kernel_data.device_bvh) {
-    return kernel_embree_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
+  IF_USING_EMBREE
+  {
+    if (kernel_data.device_bvh) {
+      return kernel_embree_intersect_local(
+          kg, ray, local_isect, local_object, lcg_state, max_hits);
+    }
  }
 #    endif

+  IF_NOT_USING_EMBREE
+  {
 #    ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
-    return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
-  }
+    if (kernel_data.bvh.have_motion) {
+      return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
+    }
 #    endif /* __OBJECT_MOTION__ */
-  return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
+    return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
+  }
+
+  kernel_assert(false);
+  return false;
 }
 #  endif

@@ -184,35 +225,44 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
  }

 #    ifdef __EMBREE__
-  if (kernel_data.device_bvh) {
-    return kernel_embree_intersect_shadow_all(
-        kg, state, ray, visibility, max_hits, num_recorded_hits, throughput);
+  IF_USING_EMBREE
+  {
+    if (kernel_data.device_bvh) {
+      return kernel_embree_intersect_shadow_all(
+          kg, state, ray, visibility, max_hits, num_recorded_hits, throughput);
+    }
  }
 #    endif

+  IF_NOT_USING_EMBREE
+  {
 #    ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
+    if (kernel_data.bvh.have_motion) {
 #      ifdef __HAIR__
-    if (kernel_data.bvh.have_curves) {
-      return bvh_intersect_shadow_all_hair_motion(
-          kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
-    }
+      if (kernel_data.bvh.have_curves) {
+        return bvh_intersect_shadow_all_hair_motion(
+            kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
+      }
 #      endif /* __HAIR__ */

-    return bvh_intersect_shadow_all_motion(
-        kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
-  }
+      return bvh_intersect_shadow_all_motion(
+          kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
+    }
 #    endif /* __OBJECT_MOTION__ */

 #    ifdef __HAIR__
-  if (kernel_data.bvh.have_curves) {
-    return bvh_intersect_shadow_all_hair(
-        kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
-  }
+    if (kernel_data.bvh.have_curves) {
+      return bvh_intersect_shadow_all_hair(
+          kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
+    }
 #    endif /* __HAIR__ */

-  return bvh_intersect_shadow_all(
-      kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
+    return bvh_intersect_shadow_all(
+        kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
+  }
+
+  kernel_assert(false);
+  return false;
 }
 #  endif /* __SHADOW_RECORD_ALL__ */

@@ -239,13 +289,28 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
    return false;
  }

-#    ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
-    return bvh_intersect_volume_motion(kg, ray, isect, visibility);
+#    ifdef __EMBREE__
+  IF_USING_EMBREE
+  {
+    if (kernel_data.device_bvh) {
+      return kernel_embree_intersect_volume(kg, ray, isect, visibility);
+    }
  }
+#    endif
+
+  IF_NOT_USING_EMBREE
+  {
+#    ifdef __OBJECT_MOTION__
+    if (kernel_data.bvh.have_motion) {
+      return bvh_intersect_volume_motion(kg, ray, isect, visibility);
+    }
 #    endif /* __OBJECT_MOTION__ */

-  return bvh_intersect_volume(kg, ray, isect, visibility);
+    return bvh_intersect_volume(kg, ray, isect, visibility);
+  }
+
+  kernel_assert(false);
+  return false;
 }
 #  endif /* defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__) */

@@ -275,18 +340,27 @@ ccl_device_intersect uint scene_intersect_volume(KernelGlobals kg,
  }

 #    ifdef __EMBREE__
-  if (kernel_data.device_bvh) {
-    return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility);
+  IF_USING_EMBREE
+  {
+    if (kernel_data.device_bvh) {
+      return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility);
+    }
  }
 #    endif

+  IF_NOT_USING_EMBREE
+  {
 #    ifdef __OBJECT_MOTION__
-  if (kernel_data.bvh.have_motion) {
-    return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
-  }
+    if (kernel_data.bvh.have_motion) {
+      return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
+    }
 #    endif /* __OBJECT_MOTION__ */

-  return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
+    return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
+  }
+
+  kernel_assert(false);
+  return false;
 }

 #  endif /* defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__) */
--- a/intern/cycles/kernel/bvh/volume_all.h
+++ b/intern/cycles/kernel/bvh/volume_all.h
@@ -51,8 +51,6 @@ ccl_device_inline
  int object = OBJECT_NONE;
  float isect_t = ray->tmax;

-  int num_hits_in_instance = 0;
-
  uint num_hits = 0;
  isect_array->t = ray->tmax;

@@ -152,7 +150,6 @@ ccl_device_inline
                  /* Move on to next entry in intersections array. */
                  isect_array++;
                  num_hits++;
-                  num_hits_in_instance++;
                  isect_array->t = isect_t;
                  if (num_hits == max_hits) {
                    return num_hits;
@@ -193,7 +190,6 @@ ccl_device_inline
                  /* Move on to next entry in intersections array. */
                  isect_array++;
                  num_hits++;
-                  num_hits_in_instance++;
                  isect_array->t = isect_t;
                  if (num_hits == max_hits) {
                    return num_hits;
@@ -219,7 +215,6 @@ ccl_device_inline
            bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

-            num_hits_in_instance = 0;
            isect_array->t = isect_t;

            ++stack_ptr;
--- a/intern/cycles/kernel/data_arrays.h
+++ b/intern/cycles/kernel/data_arrays.h
@@ -64,6 +64,7 @@ KERNEL_DATA_ARRAY(float2, light_background_conditional_cdf)
 KERNEL_DATA_ARRAY(KernelLightTreeNode, light_tree_nodes)
 KERNEL_DATA_ARRAY(KernelLightTreeEmitter, light_tree_emitters)
 KERNEL_DATA_ARRAY(uint, light_to_tree)
+KERNEL_DATA_ARRAY(uint, object_to_tree)
 KERNEL_DATA_ARRAY(uint, object_lookup_offset)
 KERNEL_DATA_ARRAY(uint, triangle_to_tree)

--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -20,6 +20,7 @@ KERNEL_STRUCT_BEGIN(KernelBackground, background)
 /* xyz store direction, w the angle. float4 instead of float3 is used
 * to ensure consistent padding/alignment across devices. */
 KERNEL_STRUCT_MEMBER(background, float4, sun)
+KERNEL_STRUCT_MEMBER(background, int, use_sun_guiding)
 /* Only shader index. */
 KERNEL_STRUCT_MEMBER(background, int, surface_shader)
 KERNEL_STRUCT_MEMBER(background, int, volume_shader)
@@ -39,6 +40,10 @@ KERNEL_STRUCT_MEMBER(background, int, use_mis)
 KERNEL_STRUCT_MEMBER(background, int, lightgroup)
 /* Light Index. */
 KERNEL_STRUCT_MEMBER(background, int, light_index)
+/* Padding. */
+KERNEL_STRUCT_MEMBER(background, int, pad1)
+KERNEL_STRUCT_MEMBER(background, int, pad2)
+KERNEL_STRUCT_MEMBER(background, int, pad3)
 KERNEL_STRUCT_END(KernelBackground)

 /* BVH: own BVH2 if no native device acceleration struct used. */
--- a/intern/cycles/kernel/device/cpu/bvh.h
+++ b/intern/cycles/kernel/device/cpu/bvh.h
@@ -13,8 +13,13 @@
 #  include <embree3/rtcore_scene.h>
 #endif

-#include "kernel/device/cpu/compat.h"
-#include "kernel/device/cpu/globals.h"
+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/compat.h"
+#  include "kernel/device/oneapi/globals.h"
+#else
+#  include "kernel/device/cpu/compat.h"
+#  include "kernel/device/cpu/globals.h"
+#endif

 #include "kernel/bvh/types.h"
 #include "kernel/bvh/util.h"
@@ -33,11 +38,16 @@ using numhit_t = uint8_t;
 using numhit_t = uint32_t;
 #endif

-#define CYCLES_EMBREE_USED_FEATURES \
-  (RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \
-                    RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \
-                    RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \
-                    RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE)
+#ifdef __KERNEL_ONEAPI__
+#  define CYCLES_EMBREE_USED_FEATURES \
+    (kernel_handler.get_specialization_constant<oneapi_embree_features>())
+#else
+#  define CYCLES_EMBREE_USED_FEATURES \
+    (RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \
+                      RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \
+                      RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \
+                      RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE)
+#endif

 #define EMBREE_IS_HAIR(x) (x & 1)

@@ -99,7 +109,9 @@ struct CCLVolumeContext
 #if EMBREE_MAJOR_VERSION >= 4
  KernelGlobals kg;
  const Ray *ray;
+#  ifdef __VOLUME_RECORD_ALL__
  numhit_t max_hits;
+#  endif
  numhit_t num_hits;
 #endif
  Intersection *vol_isect;
@@ -252,7 +264,8 @@ ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals kg,
 * Things like recording subsurface or shadow hits for later evaluation
 * as well as filtering for volume objects happen here.
 * Cycles' own BVH does that directly inside the traversal calls. */
-ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args)
+ccl_device_forceinline void kernel_embree_filter_intersection_func_impl(
+    const RTCFilterFunctionNArguments *args)
 {
  /* Current implementation in Cycles assumes only single-ray intersection queries. */
  assert(args->N == 1);
@@ -263,7 +276,11 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA
 #else
  CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
  const KernelGlobalsCPU *kg = ctx->kg;
+#endif
  const Ray *cray = ctx->ray;

  if (kernel_embree_is_self_intersection(
@@ -277,7 +294,7 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA
 * as well as filtering for volume objects happen here.
 * Cycles' own BVH does that directly inside the traversal calls.
 */
-ccl_device void kernel_embree_filter_occluded_shadow_all_func(
+ccl_device_forceinline void kernel_embree_filter_occluded_shadow_all_func_impl(
    const RTCFilterFunctionNArguments *args)
 {
  /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -290,7 +307,11 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
 #else
  CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
  const KernelGlobalsCPU *kg = ctx->kg;
+#endif
  const Ray *cray = ctx->ray;

  Intersection current_isect;
@@ -326,7 +347,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
  }

  /* Test if we need to record this transparent intersection. */
-  const numhit_t max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
+  const numhit_t max_record_hits = min(ctx->max_hits, numhit_t(INTEGRATOR_SHADOW_ISECT_SIZE));
  if (ctx->num_recorded_hits < max_record_hits) {
    /* If maximum number of hits was reached, replace the intersection with the
     * highest distance. We want to find the N closest intersections. */
@@ -363,7 +384,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
  *args->valid = 0;
 }

-ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
+ccl_device_forceinline void kernel_embree_filter_occluded_local_func_impl(
    const RTCFilterFunctionNArguments *args)
 {
  /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -376,7 +397,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
 #else
  CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
  const KernelGlobalsCPU *kg = ctx->kg;
+#endif
  const Ray *cray = ctx->ray;

  /* Check if it's hitting the correct object. */
@@ -462,7 +487,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
  *args->valid = 0;
 }

-ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
+ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func_impl(
    const RTCFilterFunctionNArguments *args)
 {
  /* Current implementation in Cycles assumes only single-ray intersection queries. */
@@ -475,11 +500,17 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
 #else
  CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
 #endif
+#ifdef __KERNEL_ONEAPI__
+  KernelGlobalsGPU *kg = nullptr;
+#else
  const KernelGlobalsCPU *kg = ctx->kg;
+#endif
  const Ray *cray = ctx->ray;

+#ifdef __VOLUME_RECORD_ALL__
  /* Append the intersection to the end of the array. */
  if (ctx->num_hits < ctx->max_hits) {
+#endif
    Intersection current_isect;
    kernel_embree_convert_hit(
        kg, ray, hit, &current_isect, reinterpret_cast<intptr_t>(args->geometryUserPtr));
@@ -496,10 +527,17 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
    int object_flag = kernel_data_fetch(object_flag, tri_object);
    if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
      --ctx->num_hits;
+#ifndef __VOLUME_RECORD_ALL__
+      /* Without __VOLUME_RECORD_ALL__ we need only a first counted hit, so we will
+       * continue tracing only if a current hit is not counted. */
+      *args->valid = 0;
+#endif
    }
+#ifdef __VOLUME_RECORD_ALL__
    /* This tells Embree to continue tracing. */
    *args->valid = 0;
  }
+#endif
 }

 #if EMBREE_MAJOR_VERSION < 4
@@ -513,14 +551,14 @@ ccl_device_forceinline void kernel_embree_filter_occluded_func(

  switch (ctx->type) {
    case CCLIntersectContext::RAY_SHADOW_ALL:
-      kernel_embree_filter_occluded_shadow_all_func(args);
+      kernel_embree_filter_occluded_shadow_all_func_impl(args);
      break;
    case CCLIntersectContext::RAY_LOCAL:
    case CCLIntersectContext::RAY_SSS:
-      kernel_embree_filter_occluded_local_func(args);
+      kernel_embree_filter_occluded_local_func_impl(args);
      break;
    case CCLIntersectContext::RAY_VOLUME_ALL:
-      kernel_embree_filter_occluded_volume_all_func(args);
+      kernel_embree_filter_occluded_volume_all_func_impl(args);
      break;

    case CCLIntersectContext::RAY_REGULAR:
@@ -569,7 +607,63 @@ ccl_device void kernel_embree_filter_occluded_func_backface_cull(

  kernel_embree_filter_occluded_func(args);
 }
+#endif

+#ifdef __KERNEL_ONEAPI__
+/* Static wrappers so we can call the callbacks from out side the ONEAPIKernelContext class */
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_intersection_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLFirstHitContext *ctx = (CCLFirstHitContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_intersection_func_impl(args);
+}
+
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_occluded_shadow_all_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLShadowContext *ctx = (CCLShadowContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_occluded_shadow_all_func_impl(args);
+}
+
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_occluded_local_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLLocalContext *ctx = (CCLLocalContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_occluded_local_func_impl(args);
+}
+
+RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
+kernel_embree_filter_occluded_volume_all_func_static(const RTCFilterFunctionNArguments *args)
+{
+  RTCHit *hit = (RTCHit *)args->hit;
+  CCLVolumeContext *ctx = (CCLVolumeContext *)(args->context);
+  ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
+  context->kernel_embree_filter_occluded_volume_all_func_impl(args);
+}
+
+#  define kernel_embree_filter_intersection_func \
+    ONEAPIKernelContext::kernel_embree_filter_intersection_func_static
+#  define kernel_embree_filter_occluded_shadow_all_func \
+    ONEAPIKernelContext::kernel_embree_filter_occluded_shadow_all_func_static
+#  define kernel_embree_filter_occluded_local_func \
+    ONEAPIKernelContext::kernel_embree_filter_occluded_local_func_static
+#  define kernel_embree_filter_occluded_volume_all_func \
+    ONEAPIKernelContext::kernel_embree_filter_occluded_volume_all_func_static
+#else
+#  define kernel_embree_filter_intersection_func kernel_embree_filter_intersection_func_impl
+#  if EMBREE_MAJOR_VERSION >= 4
+#    define kernel_embree_filter_occluded_shadow_all_func \
+      kernel_embree_filter_occluded_shadow_all_func_impl
+#    define kernel_embree_filter_occluded_local_func kernel_embree_filter_occluded_local_func_impl
+#    define kernel_embree_filter_occluded_volume_all_func \
+      kernel_embree_filter_occluded_volume_all_func_impl
+#  endif
 #endif

 /* Scene intersection. */
@@ -583,7 +677,15 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
 #if EMBREE_MAJOR_VERSION >= 4
  CCLFirstHitContext ctx;
  rtcInitRayQueryContext(&ctx);
+#  ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko): Cycles GPU back-ends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#  else
  ctx.kg = kg;
+#  endif
 #else
  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
  rtcInitIntersectContext(&ctx);
@@ -596,7 +698,7 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
 #if EMBREE_MAJOR_VERSION >= 4
  RTCIntersectArguments args;
  rtcInitIntersectArguments(&args);
-  args.filter = (RTCFilterFunctionN)kernel_embree_filter_intersection_func;
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(kernel_embree_filter_intersection_func);
  args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
  args.context = &ctx;
  rtcIntersect1(kernel_data.device_bvh, &ray_hit, &args);
@@ -625,7 +727,15 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
  CCLLocalContext ctx;
  rtcInitRayQueryContext(&ctx);
+#    ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko): Cycles GPU back-ends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#    else
  ctx.kg = kg;
+#    endif
 #  else
  CCLIntersectContext ctx(kg,
                          has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
@@ -646,7 +756,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
  RTCOccludedArguments args;
  rtcInitOccludedArguments(&args);
-  args.filter = (RTCFilterFunctionN)(kernel_embree_filter_occluded_local_func);
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(kernel_embree_filter_occluded_local_func);
  args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
  args.context = &ctx;
 #  endif
@@ -692,7 +802,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,

 #ifdef __SHADOW_RECORD_ALL__
 ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
-                                                             IntegratorShadowStateCPU *state,
+                                                             IntegratorShadowState state,
                                                             ccl_private const Ray *ray,
                                                             uint visibility,
                                                             uint max_hits,
@@ -702,7 +812,15 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
  CCLShadowContext ctx;
  rtcInitRayQueryContext(&ctx);
+#    ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko): Cycles GPU back-ends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#    else
  ctx.kg = kg;
+#    endif
 #  else
  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
  rtcInitIntersectContext(&ctx);
@@ -718,7 +836,8 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
  RTCOccludedArguments args;
  rtcInitOccludedArguments(&args);
-  args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_shadow_all_func;
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(
+      kernel_embree_filter_occluded_shadow_all_func);
  args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
  args.context = &ctx;
  rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args);
@@ -736,19 +855,31 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
 ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
                                                         ccl_private const Ray *ray,
                                                         ccl_private Intersection *isect,
+#  ifdef __VOLUME_RECORD_ALL__
                                                         const uint max_hits,
+#  endif
                                                         const uint visibility)
 {
 #  if EMBREE_MAJOR_VERSION >= 4
  CCLVolumeContext ctx;
  rtcInitRayQueryContext(&ctx);
+#    ifdef __KERNEL_ONEAPI__
+  /* NOTE(sirgienko) Cycles GPU back-ends passes NULL to KernelGlobals and
+   * uses global device allocation (CUDA, Optix, HIP) or passes all needed data
+   * as a class context (Metal, oneAPI). So we need to pass this context here
+   * in order to have an access to it later in Embree filter functions on GPU. */
+  ctx.kg = (KernelGlobals)this;
+#    else
  ctx.kg = kg;
+#    endif
 #  else
  CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
  rtcInitIntersectContext(&ctx);
 #  endif
  ctx.vol_isect = isect;
+#  ifdef __VOLUME_RECORD_ALL__
  ctx.max_hits = numhit_t(max_hits);
+#  endif
  ctx.num_hits = numhit_t(0);
  ctx.ray = ray;
  RTCRay rtc_ray;
@@ -756,7 +887,8 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
 #  if EMBREE_MAJOR_VERSION >= 4
  RTCOccludedArguments args;
  rtcInitOccludedArguments(&args);
-  args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_volume_all_func;
+  args.filter = reinterpret_cast<RTCFilterFunctionN>(
+      kernel_embree_filter_occluded_volume_all_func);
  args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
  args.context = &ctx;
  rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args);
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -128,6 +128,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 ccl_gpu_kernel_postfix

+/* Intersection kernels need access to the kernel handler for specialization constants to work
+ * properly. */
+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/context_intersect_begin.h"
+#endif
+
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_intersect_closest,
                             ccl_global const int *path_index_array,
@@ -185,6 +191,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 }
 ccl_gpu_kernel_postfix

+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/context_intersect_end.h"
+#endif
+
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_background,
                             ccl_global const int *path_index_array,
@@ -249,6 +259,12 @@ ccl_gpu_kernel_postfix
 constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
 #endif

+/* Kernels using intersections need access to the kernel handler for specialization constants to
+ * work properly. */
+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/context_intersect_begin.h"
+#endif
+
 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
                             ccl_global const int *path_index_array,
@@ -287,6 +303,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
  }
 }
 ccl_gpu_kernel_postfix
+#ifdef __KERNEL_ONEAPI__
+#  include "kernel/device/oneapi/context_intersect_end.h"
+#endif

 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_volume,
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -5,6 +5,11 @@

 #define __KERNEL_GPU__
 #define __KERNEL_ONEAPI__
+#define __KERNEL_64_BIT__
+
+#ifdef WITH_EMBREE_GPU
+#  define __KERNEL_GPU_RAYTRACING__
+#endif

 #define CCL_NAMESPACE_BEGIN
 #define CCL_NAMESPACE_END
@@ -57,17 +62,19 @@
 #define ccl_gpu_kernel_threads(block_num_threads)

 #ifndef WITH_ONEAPI_SYCL_HOST_TASK
-#  define ccl_gpu_kernel_signature(name, ...) \
+#  define __ccl_gpu_kernel_signature(name, ...) \
 void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
                          size_t kernel_global_size, \
                          size_t kernel_local_size, \
                          sycl::handler &cgh, \
                          __VA_ARGS__) { \
      (kg); \
-      cgh.parallel_for<class kernel_##name>( \
+      cgh.parallel_for( \
          sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
          [=](sycl::nd_item<1> item) {

+#  define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
+
 #  define ccl_gpu_kernel_postfix \
          }); \
    }
--- a/intern/cycles/kernel/device/oneapi/context_intersect_begin.h
+++ b/intern/cycles/kernel/device/oneapi/context_intersect_begin.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2023 Intel Corporation */
+
+#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU)
+#  undef ccl_gpu_kernel_signature
+#  define ccl_gpu_kernel_signature(name, ...) \
+    void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
+                              size_t kernel_global_size, \
+                              size_t kernel_local_size, \
+                              sycl::handler &cgh, \
+                              __VA_ARGS__) \
+    { \
+      (kg); \
+      cgh.parallel_for( \
+          sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
+          [=](sycl::nd_item<1> item, sycl::kernel_handler oneapi_kernel_handler) { \
+            ((ONEAPIKernelContext*)kg)->kernel_handler = oneapi_kernel_handler;
+#endif
--- a/intern/cycles/kernel/device/oneapi/context_intersect_end.h
+++ b/intern/cycles/kernel/device/oneapi/context_intersect_end.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2023 Intel Corporation */
+
+#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU)
+#  undef ccl_gpu_kernel_signature
+#  define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
+#endif
--- a/intern/cycles/kernel/device/oneapi/globals.h
+++ b/intern/cycles/kernel/device/oneapi/globals.h
@@ -31,6 +31,8 @@ typedef struct KernelGlobalsGPU {
  size_t nd_item_group_range_0;
  size_t nd_item_global_id_0;
  size_t nd_item_global_range_0;
+#else
+  sycl::kernel_handler kernel_handler;
 #endif
 } KernelGlobalsGPU;

--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -16,9 +16,22 @@

 #  include "kernel/device/gpu/kernel.h"

+#  include "device/kernel.cpp"
+
 static OneAPIErrorCallback s_error_cb = nullptr;
 static void *s_error_user_ptr = nullptr;

+#  ifdef WITH_EMBREE_GPU
+static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_BASIC_FEATURES =
+    (const RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE |
+                            RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS |
+                            RTC_FEATURE_FLAG_POINT | RTC_FEATURE_FLAG_MOTION_BLUR);
+static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_ALL_FEATURES =
+    (const RTCFeatureFlags)(CYCLES_ONEAPI_EMBREE_BASIC_FEATURES |
+                            RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE |
+                            RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE);
+#  endif
+
 void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
 {
  s_error_cb = cb;
@@ -142,15 +155,99 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
  return std::min(limit_work_group_size, preferred_work_group_size);
 }

-bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
+bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
+                                            const uint kernel_features)
+{
+  if ((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0 &&
+      kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE)) !=
+          std::string::npos)
+    return false;
+  if ((kernel_features & KERNEL_FEATURE_MNEE) == 0 &&
+      kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE)) !=
+          std::string::npos)
+    return false;
+  if ((kernel_features & KERNEL_FEATURE_VOLUME) == 0 &&
+      kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK)) !=
+          std::string::npos)
+    return false;
+
+  return true;
+}
+
+bool oneapi_kernel_is_raytrace_or_mnee(const std::string &kernel_name)
+{
+  return (kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE)) !=
+          std::string::npos) ||
+         (kernel_name.find(device_kernel_as_string(
+              DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE)) != std::string::npos);
+}
+
+bool oneapi_kernel_is_using_embree(const std::string &kernel_name)
+{
+#  ifdef WITH_EMBREE_GPU
+  /* MNEE and Ray-trace kernels aren't yet enabled to use Embree. */
+  for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+    DeviceKernel kernel = (DeviceKernel)i;
+    if (device_kernel_has_intersection(kernel)) {
+      if (kernel_name.find(device_kernel_as_string(kernel)) != std::string::npos) {
+        return !oneapi_kernel_is_raytrace_or_mnee(kernel_name);
+      }
+    }
+  }
+#  endif
+  return false;
+}
+
+bool oneapi_load_kernels(SyclQueue *queue_,
+                         const uint kernel_features,
+                         bool use_hardware_raytracing)
 {
-#  ifdef SYCL_SKIP_KERNELS_PRELOAD
-  (void)queue_;
-  (void)requested_features;
-#  else
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);

+#  ifdef WITH_EMBREE_GPU
+  /* For best performance, we always JIT compile the kernels that are using Embree. */
+  if (use_hardware_raytracing) {
+    try {
+      sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
+          sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
+                                                             {queue->get_device()});
+
+      for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
+        const std::string &kernel_name = kernel_id.get_name();
+
+        if (!oneapi_kernel_is_required_for_features(kernel_name, kernel_features) ||
+            !oneapi_kernel_is_using_embree(kernel_name)) {
+          continue;
+        }
+
+        sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle_input =
+            sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+
+        /* Hair requires embree curves support. */
+        if (kernel_features & KERNEL_FEATURE_HAIR) {
+          one_kernel_bundle_input
+              .set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+                  CYCLES_ONEAPI_EMBREE_ALL_FEATURES);
+          sycl::build(one_kernel_bundle_input);
+        }
+        else {
+          one_kernel_bundle_input
+              .set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+                  CYCLES_ONEAPI_EMBREE_BASIC_FEATURES);
+          sycl::build(one_kernel_bundle_input);
+        }
+      }
+    }
+    catch (sycl::exception const &e) {
+      if (s_error_cb) {
+        s_error_cb(e.what(), s_error_user_ptr);
+      }
+      return false;
+    }
+  }
+#  endif
+
  try {
    sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
        sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
@@ -159,27 +256,29 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
    for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
      const std::string &kernel_name = kernel_id.get_name();

-      /* NOTE(@nsirgien): Names in this conditions below should match names from
-       * oneapi_call macro in oneapi_enqueue_kernel below */
-      if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
-          kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
+      /* In case HWRT is on, compilation of kernels using Embree is already handled in previous
+       * block. */
+      if (!oneapi_kernel_is_required_for_features(kernel_name, kernel_features) ||
+          (use_hardware_raytracing && oneapi_kernel_is_using_embree(kernel_name))) {
        continue;
      }

-      if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
-          kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
+#  ifdef WITH_EMBREE_GPU
+      if (oneapi_kernel_is_using_embree(kernel_name) ||
+          oneapi_kernel_is_raytrace_or_mnee(kernel_name)) {
+        sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle_input =
+            sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
+        one_kernel_bundle_input
+            .set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+                RTC_FEATURE_FLAG_NONE);
+        sycl::build(one_kernel_bundle_input);
        continue;
      }
-
-      if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
-          kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
-              std::string::npos) {
-        continue;
-      }
-
-      sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
-          sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
-      sycl::build(one_kernel_bundle);
+#  endif
+      /* This call will ensure that AoT or cached JIT binaries are available
+       * for execution. It will trigger compilation if it is not already the case. */
+      (void)sycl::get_kernel_bundle<sycl::bundle_state::executable>(queue->get_context(),
+                                                                    {kernel_id});
    }
  }
  catch (sycl::exception const &e) {
@@ -188,13 +287,14 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
    }
    return false;
  }
-#  endif
  return true;
 }

 bool oneapi_enqueue_kernel(KernelContext *kernel_context,
                           int kernel,
                           size_t global_size,
+                           const uint kernel_features,
+                           bool use_hardware_raytracing,
                           void **args)
 {
  bool success = true;
@@ -248,6 +348,21 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,

  try {
    queue->submit([&](sycl::handler &cgh) {
+#  ifdef WITH_EMBREE_GPU
+      /* Spec says it has no effect if the called kernel doesn't support the below specialization
+       * constant but it can still trigger a recompilation, so we set it only if needed. */
+      if (device_kernel_has_intersection(device_kernel)) {
+        const RTCFeatureFlags used_embree_features = !use_hardware_raytracing ?
+                                                         RTC_FEATURE_FLAG_NONE :
+                                                     !(kernel_features & KERNEL_FEATURE_HAIR) ?
+                                                         CYCLES_ONEAPI_EMBREE_BASIC_FEATURES :
+                                                         CYCLES_ONEAPI_EMBREE_ALL_FEATURES;
+        cgh.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
+            used_embree_features);
+      }
+#  else
+      (void)kernel_features;
+#  endif
      switch (device_kernel) {
        case DEVICE_KERNEL_INTEGRATOR_RESET: {
          oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
@@ -549,4 +664,5 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 #  endif
  return success;
 }
+
 #endif /* WITH_ONEAPI */
--- a/intern/cycles/kernel/device/oneapi/kernel.h
+++ b/intern/cycles/kernel/device/oneapi/kernel.h
@@ -47,10 +47,14 @@ CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
                                                       int kernel,
                                                       size_t global_size,
+                                                       const unsigned int kernel_features,
+                                                       bool use_hardware_raytracing,
                                                       void **args);
 CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
-                                                     const unsigned int requested_features);
+                                                     const unsigned int kernel_features,
+                                                     bool use_hardware_raytracing);
 #  ifdef __cplusplus
 }
+
 #  endif
 #endif /* WITH_ONEAPI */
--- a/intern/cycles/kernel/integrator/guiding.h
+++ b/intern/cycles/kernel/integrator/guiding.h
@@ -454,8 +454,13 @@ ccl_device_forceinline bool guiding_bsdf_init(KernelGlobals kg,
                                              ccl_private float &rand)
 {
 #if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+#  if OPENPGL_VERSION_MINOR >= 5
+  if (kg->opgl_surface_sampling_distribution->Init(
+          kg->opgl_guiding_field, guiding_point3f(P), rand)) {
+#  else
  if (kg->opgl_surface_sampling_distribution->Init(
          kg->opgl_guiding_field, guiding_point3f(P), rand, true)) {
+#  endif
    kg->opgl_surface_sampling_distribution->ApplyCosineProduct(guiding_point3f(N));
    return true;
  }
@@ -506,8 +511,13 @@ ccl_device_forceinline bool guiding_phase_init(KernelGlobals kg,
    return false;
  }

+#  if OPENPGL_VERSION_MINOR >= 5
+  if (kg->opgl_volume_sampling_distribution->Init(
+          kg->opgl_guiding_field, guiding_point3f(P), rand)) {
+#  else
  if (kg->opgl_volume_sampling_distribution->Init(
          kg->opgl_guiding_field, guiding_point3f(P), rand, true)) {
+#  endif
    kg->opgl_volume_sampling_distribution->ApplySingleLobeHenyeyGreensteinProduct(guiding_vec3f(D),
                                                                                  g);
    return true;
--- a/intern/cycles/kernel/light/area.h
+++ b/intern/cycles/kernel/light/area.h
@@ -342,7 +342,7 @@ ccl_device_forceinline void area_light_update_position(const ccl_global KernelLi
  ls->D = normalize_len(ls->P - P, &ls->t);
  ls->pdf = invarea;

-  if (klight->area.tan_half_spread > 0) {
+  if (klight->area.normalize_spread > 0) {
    ls->eval_fac = 0.25f * invarea;
    ls->eval_fac *= area_light_spread_attenuation(
        ls->D, ls->Ng, klight->area.tan_half_spread, klight->area.normalize_spread);
--- a/intern/cycles/kernel/light/distribution.h
+++ b/intern/cycles/kernel/light/distribution.h
@@ -56,7 +56,7 @@ ccl_device_noinline bool light_distribution_sample(KernelGlobals kg,
  const int index = light_distribution_sample(kg, randn);
  const float pdf_selection = kernel_data.integrator.distribution_pdf_lights;
  return light_sample<in_volume_segment>(
-      kg, randu, randv, time, P, bounce, path_flag, index, pdf_selection, ls);
+      kg, randu, randv, time, P, bounce, path_flag, index, 0, pdf_selection, ls);
 }

 ccl_device_inline float light_distribution_pdf_lamp(KernelGlobals kg)
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -108,6 +108,7 @@ ccl_device_noinline bool light_sample(KernelGlobals kg,
                                      const int bounce,
                                      const uint32_t path_flag,
                                      const int emitter_index,
+                                      const int object_id,
                                      const float pdf_selection,
                                      ccl_private LightSample *ls)
 {
@@ -117,8 +118,9 @@ ccl_device_noinline bool light_sample(KernelGlobals kg,
  if (kernel_data.integrator.use_light_tree) {
    ccl_global const KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
                                                                           emitter_index);
-    prim = kemitter->prim_id;
-    mesh_light = kemitter->mesh_light;
+    prim = kemitter->light.id;
+    mesh_light.shader_flag = kemitter->mesh_light.shader_flag;
+    mesh_light.object_id = object_id;
  }
  else
 #endif
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -438,7 +438,9 @@ ccl_device_inline float light_sample_mis_weight_forward_surface(KernelGlobals kg
    const float3 N = INTEGRATOR_STATE(state, path, mis_origin_n);
    uint lookup_offset = kernel_data_fetch(object_lookup_offset, sd->object);
    uint prim_offset = kernel_data_fetch(object_prim_offset, sd->object);
-    pdf *= light_tree_pdf(kg, ray_P, N, path_flag, sd->prim - prim_offset + lookup_offset);
+    uint triangle = kernel_data_fetch(triangle_to_tree, sd->prim - prim_offset + lookup_offset);
+
+    pdf *= light_tree_pdf(kg, ray_P, N, path_flag, sd->object, triangle);
  }
  else
 #endif
@@ -462,7 +464,7 @@ ccl_device_inline float light_sample_mis_weight_forward_lamp(KernelGlobals kg,
 #ifdef __LIGHT_TREE__
  if (kernel_data.integrator.use_light_tree) {
    const float3 N = INTEGRATOR_STATE(state, path, mis_origin_n);
-    pdf *= light_tree_pdf(kg, P, N, path_flag, ~ls->lamp);
+    pdf *= light_tree_pdf(kg, P, N, path_flag, 0, kernel_data_fetch(light_to_tree, ls->lamp));
  }
  else
 #endif
@@ -496,7 +498,8 @@ ccl_device_inline float light_sample_mis_weight_forward_background(KernelGlobals
 #ifdef __LIGHT_TREE__
  if (kernel_data.integrator.use_light_tree) {
    const float3 N = INTEGRATOR_STATE(state, path, mis_origin_n);
-    pdf *= light_tree_pdf(kg, ray_P, N, path_flag, ~kernel_data.background.light_index);
+    uint light = kernel_data_fetch(light_to_tree, kernel_data.background.light_index);
+    pdf *= light_tree_pdf(kg, ray_P, N, path_flag, 0, light);
  }
  else
 #endif
--- a/intern/cycles/kernel/light/tree.h
+++ b/intern/cycles/kernel/light/tree.h
@@ -69,6 +69,59 @@ ccl_device float3 compute_v(
                                                    cos_phi0 * o0 + dot_o1_a * inv_len * o1;
 }

+ccl_device_inline bool is_light(const ccl_global KernelLightTreeEmitter *kemitter)
+{
+  return kemitter->light.id < 0;
+}
+
+ccl_device_inline bool is_mesh(const ccl_global KernelLightTreeEmitter *kemitter)
+{
+  return !is_light(kemitter) && kemitter->mesh_light.object_id == OBJECT_NONE;
+}
+
+ccl_device_inline bool is_triangle(const ccl_global KernelLightTreeEmitter *kemitter)
+{
+  return !is_light(kemitter) && kemitter->mesh_light.object_id != OBJECT_NONE;
+}
+
+ccl_device_inline bool is_leaf(const ccl_global KernelLightTreeNode *knode)
+{
+  /* The distant node is also considered o leaf node. */
+  return knode->type >= LIGHT_TREE_LEAF;
+}
+
+template<bool in_volume_segment>
+ccl_device void light_tree_to_local_space(KernelGlobals kg,
+                                          const int object_id,
+                                          ccl_private float3 &P,
+                                          ccl_private float3 &N_or_D,
+                                          ccl_private float &t)
+{
+  const int object_flag = kernel_data_fetch(object_flag, object_id);
+  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+#ifdef __OBJECT_MOTION__
+    Transform itfm;
+    object_fetch_transform_motion_test(kg, object_id, 0.5f, &itfm);
+#else
+    const Transform itfm = object_fetch_transform(kg, object_id, OBJECT_INVERSE_TRANSFORM);
+#endif
+    P = transform_point(&itfm, P);
+    if (in_volume_segment) {
+      /* Transform direction. */
+      float3 D_local = transform_direction(&itfm, N_or_D);
+      float scale;
+      N_or_D = normalize_len(D_local, &scale);
+
+      t *= scale;
+    }
+    else if (!is_zero(N_or_D)) {
+      /* Transform normal. */
+      const Transform tfm = object_fetch_transform(kg, object_id, OBJECT_TRANSFORM);
+      N_or_D = normalize(transform_direction_transposed(&tfm, N_or_D));
+    }
+  }
+}
+
 /* This is the general function for calculating the importance of either a cluster or an emitter.
 * Both of the specialized functions obtain the necessary data before calling this function. */
 template<bool in_volume_segment>
@@ -184,9 +237,8 @@ ccl_device bool compute_emitter_centroid_and_dir(KernelGlobals kg,
                                                 ccl_private float3 &centroid,
                                                 ccl_private packed_float3 &dir)
 {
-  const int prim_id = kemitter->prim_id;
-  if (prim_id < 0) {
-    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ~prim_id);
+  if (is_light(kemitter)) {
+    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ~(kemitter->light.id));
    centroid = klight->co;

    switch (klight->type) {
@@ -213,19 +265,22 @@ ccl_device bool compute_emitter_centroid_and_dir(KernelGlobals kg,
    }
  }
  else {
+    kernel_assert(is_triangle(kemitter));
    const int object = kemitter->mesh_light.object_id;
    float3 vertices[3];
-    triangle_world_space_vertices(kg, object, prim_id, -1.0f, vertices);
+    triangle_vertices(kg, kemitter->triangle.id, vertices);
    centroid = (vertices[0] + vertices[1] + vertices[2]) / 3.0f;

-    const bool is_front_only = (kemitter->emission_sampling == EMISSION_SAMPLING_FRONT);
-    const bool is_back_only = (kemitter->emission_sampling == EMISSION_SAMPLING_BACK);
+    const bool is_front_only = (kemitter->triangle.emission_sampling == EMISSION_SAMPLING_FRONT);
+    const bool is_back_only = (kemitter->triangle.emission_sampling == EMISSION_SAMPLING_BACK);
    if (is_front_only || is_back_only) {
      dir = safe_normalize(cross(vertices[1] - vertices[0], vertices[2] - vertices[0]));
      if (is_back_only) {
        dir = -dir;
      }
-      if (kernel_data_fetch(object_flag, object) & SD_OBJECT_NEGATIVE_SCALE) {
+      const int object_flag = kernel_data_fetch(object_flag, object);
+      if ((object_flag & SD_OBJECT_TRANSFORM_APPLIED) &&
+          (object_flag & SD_OBJECT_NEGATIVE_SCALE)) {
        dir = -dir;
      }
    }
@@ -237,6 +292,75 @@ ccl_device bool compute_emitter_centroid_and_dir(KernelGlobals kg,
  return true;
 }

+template<bool in_volume_segment>
+ccl_device void light_tree_node_importance(KernelGlobals kg,
+                                           const float3 P,
+                                           const float3 N_or_D,
+                                           const float t,
+                                           const bool has_transmission,
+                                           const ccl_global KernelLightTreeNode *knode,
+                                           ccl_private float &max_importance,
+                                           ccl_private float &min_importance)
+{
+  const BoundingCone bcone = knode->bcone;
+  const BoundingBox bbox = knode->bbox;
+
+  float3 point_to_centroid;
+  float cos_theta_u;
+  float distance;
+  if (knode->type == LIGHT_TREE_DISTANT) {
+    if (in_volume_segment) {
+      return;
+    }
+    point_to_centroid = -bcone.axis;
+    cos_theta_u = fast_cosf(bcone.theta_o);
+    distance = 1.0f;
+  }
+  else {
+    const float3 centroid = 0.5f * (bbox.min + bbox.max);
+
+    if (in_volume_segment) {
+      const float3 D = N_or_D;
+      const float3 closest_point = P + dot(centroid - P, D) * D;
+      /* Minimal distance of the ray to the cluster. */
+      distance = len(centroid - closest_point);
+      point_to_centroid = -compute_v(centroid, P, D, bcone.axis, t);
+      cos_theta_u = light_tree_cos_bounding_box_angle(bbox, closest_point, point_to_centroid);
+    }
+    else {
+      const float3 N = N_or_D;
+      const float3 bbox_extent = bbox.max - centroid;
+      const bool bbox_is_visible = has_transmission |
+                                   (dot(N, centroid - P) + dot(fabs(N), fabs(bbox_extent)) > 0);
+
+      /* If the node is guaranteed to be behind the surface we're sampling, and the surface is
+       * opaque, then we can give the node an importance of 0 as it contributes nothing to the
+       * surface. */
+      if (!bbox_is_visible) {
+        return;
+      }
+
+      point_to_centroid = normalize_len(centroid - P, &distance);
+      cos_theta_u = light_tree_cos_bounding_box_angle(bbox, P, point_to_centroid);
+    }
+    /* Clamp distance to half the radius of the cluster when splitting is disabled. */
+    distance = fmaxf(0.5f * len(centroid - bbox.max), distance);
+  }
+  /* TODO: currently max_distance = min_distance, max_importance = min_importance for the
+   * nodes. Do we need better weights for complex scenes? */
+  light_tree_importance<in_volume_segment>(N_or_D,
+                                           has_transmission,
+                                           point_to_centroid,
+                                           cos_theta_u,
+                                           bcone,
+                                           distance,
+                                           distance,
+                                           t,
+                                           knode->energy,
+                                           max_importance,
+                                           min_importance);
+}
+
 template<bool in_volume_segment>
 ccl_device void light_tree_emitter_importance(KernelGlobals kg,
                                              const float3 P,
@@ -247,11 +371,21 @@ ccl_device void light_tree_emitter_importance(KernelGlobals kg,
                                              ccl_private float &max_importance,
                                              ccl_private float &min_importance)
 {
+  max_importance = 0.0f;
+  min_importance = 0.0f;
+
  const ccl_global KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
                                                                         emitter_index);

-  max_importance = 0.0f;
-  min_importance = 0.0f;
+  if (is_mesh(kemitter)) {
+    const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes,
+                                                                     kemitter->mesh.node_id);
+
+    light_tree_node_importance<in_volume_segment>(
+        kg, P, N_or_D, t, has_transmission, knode, max_importance, min_importance);
+    return;
+  }
+
  BoundingCone bcone;
  bcone.theta_o = kemitter->theta_o;
  bcone.theta_e = kemitter->theta_e;
@@ -264,8 +398,6 @@ ccl_device void light_tree_emitter_importance(KernelGlobals kg,
    return;
  }

-  const int prim_id = kemitter->prim_id;
-
  if (in_volume_segment) {
    const float3 D = N_or_D;
    /* Closest point. */
@@ -279,9 +411,15 @@ ccl_device void light_tree_emitter_importance(KernelGlobals kg,
    P_c = P;
  }

+  /* Early out if the emitter is guaranteed to be invisible. */
  bool is_visible;
-  if (prim_id < 0) {
-    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ~prim_id);
+  if (is_triangle(kemitter)) {
+    is_visible = triangle_light_tree_parameters<in_volume_segment>(
+        kg, kemitter, centroid, P_c, N_or_D, bcone, cos_theta_u, distance, point_to_centroid);
+  }
+  else {
+    kernel_assert(is_light(kemitter));
+    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ~(kemitter->light.id));
    switch (klight->type) {
      /* Function templates only modifies cos_theta_u when in_volume_segment = true. */
      case LIGHT_SPOT:
@@ -309,10 +447,6 @@ ccl_device void light_tree_emitter_importance(KernelGlobals kg,
        return;
    }
  }
-  else { /* Mesh light. */
-    is_visible = triangle_light_tree_parameters<in_volume_segment>(
-        kg, kemitter, centroid, P_c, N_or_D, bcone, cos_theta_u, distance, point_to_centroid);
-  }

  is_visible |= has_transmission;
  if (!is_visible) {
@@ -333,81 +467,31 @@ ccl_device void light_tree_emitter_importance(KernelGlobals kg,
 }

 template<bool in_volume_segment>
-ccl_device void light_tree_node_importance(KernelGlobals kg,
-                                           const float3 P,
-                                           const float3 N_or_D,
-                                           const float t,
-                                           const bool has_transmission,
-                                           const ccl_global KernelLightTreeNode *knode,
-                                           ccl_private float &max_importance,
-                                           ccl_private float &min_importance)
+ccl_device void light_tree_child_importance(KernelGlobals kg,
+                                            const float3 P,
+                                            const float3 N_or_D,
+                                            const float t,
+                                            const bool has_transmission,
+                                            const ccl_global KernelLightTreeNode *knode,
+                                            ccl_private float &max_importance,
+                                            ccl_private float &min_importance)
 {
  max_importance = 0.0f;
  min_importance = 0.0f;
+
  if (knode->num_emitters == 1) {
-    /* At a leaf node with only one emitter. */
-    light_tree_emitter_importance<in_volume_segment>(
-        kg, P, N_or_D, t, has_transmission, -knode->child_index, max_importance, min_importance);
+    light_tree_emitter_importance<in_volume_segment>(kg,
+                                                     P,
+                                                     N_or_D,
+                                                     t,
+                                                     has_transmission,
+                                                     knode->leaf.first_emitter,
+                                                     max_importance,
+                                                     min_importance);
  }
  else if (knode->num_emitters != 0) {
-    const BoundingCone bcone = knode->bcone;
-    const BoundingBox bbox = knode->bbox;
-
-    float3 point_to_centroid;
-    float cos_theta_u;
-    float distance;
-    if (knode->bit_trail == 1) {
-      /* Distant light node. */
-      if (in_volume_segment) {
-        return;
-      }
-      point_to_centroid = -bcone.axis;
-      cos_theta_u = fast_cosf(bcone.theta_o);
-      distance = 1.0f;
-    }
-    else {
-      const float3 centroid = 0.5f * (bbox.min + bbox.max);
-
-      if (in_volume_segment) {
-        const float3 D = N_or_D;
-        const float3 closest_point = P + dot(centroid - P, D) * D;
-        /* Minimal distance of the ray to the cluster. */
-        distance = len(centroid - closest_point);
-        point_to_centroid = -compute_v(centroid, P, D, bcone.axis, t);
-        cos_theta_u = light_tree_cos_bounding_box_angle(bbox, closest_point, point_to_centroid);
-      }
-      else {
-        const float3 N = N_or_D;
-        const float3 bbox_extent = bbox.max - centroid;
-        const bool bbox_is_visible = has_transmission |
-                                     (dot(N, centroid - P) + dot(fabs(N), fabs(bbox_extent)) > 0);
-
-        /* If the node is guaranteed to be behind the surface we're sampling, and the surface is
-         * opaque, then we can give the node an importance of 0 as it contributes nothing to the
-         * surface. */
-        if (!bbox_is_visible) {
-          return;
-        }
-
-        point_to_centroid = normalize_len(centroid - P, &distance);
-        cos_theta_u = light_tree_cos_bounding_box_angle(bbox, P, point_to_centroid);
-      }
-      /* Clamp distance to half the radius of the cluster when splitting is disabled. */
-      distance = fmaxf(0.5f * len(centroid - bbox.max), distance);
-    }
-    /* TODO: currently max_distance = min_distance, max_importance = min_importance for the
-     * nodes. Do we need better weights for complex scenes? */
-    light_tree_importance<in_volume_segment>(N_or_D,
-                                             has_transmission,
-                                             point_to_centroid,
-                                             cos_theta_u,
-                                             bcone,
-                                             distance,
-                                             distance,
-                                             t,
-                                             knode->energy,
-                                             max_importance,
-                                             min_importance);
+    light_tree_node_importance<in_volume_segment>(
+        kg, P, N_or_D, t, has_transmission, knode, max_importance, min_importance);
  }
 }

@@ -440,26 +524,30 @@ ccl_device void sample_resevoir(const int current_index,
 template<bool in_volume_segment>
 ccl_device int light_tree_cluster_select_emitter(KernelGlobals kg,
                                                 ccl_private float &rand,
-                                                 const float3 P,
-                                                 const float3 N_or_D,
-                                                 const float t,
+                                                 ccl_private float3 &P,
+                                                 ccl_private float3 &N_or_D,
+                                                 ccl_private float &t,
                                                 const bool has_transmission,
-                                                 const ccl_global KernelLightTreeNode *knode,
+                                                 ccl_private int *node_index,
                                                 ccl_private float *pdf_factor)
 {
  float selected_importance[2] = {0.0f, 0.0f};
  float total_importance[2] = {0.0f, 0.0f};
  int selected_index = -1;
+  const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes, *node_index);
+  *node_index = -1;

  /* Mark emitters with zero importance. Used for resevoir when total minimum importance = 0. */
  kernel_assert(knode->num_emitters <= sizeof(uint) * 8);
  uint has_importance = 0;

  const bool sample_max = (rand > 0.5f); /* Sampling using the maximum importance. */
-  rand = rand * 2.0f - float(sample_max);
+  if (knode->num_emitters > 1) {
+    rand = rand * 2.0f - float(sample_max);
+  }

  for (int i = 0; i < knode->num_emitters; i++) {
-    int current_index = -knode->child_index + i;
+    int current_index = knode->leaf.first_emitter + i;
    /* maximum importance = importance[0], minimum importance = importance[1] */
    float importance[2];
    light_tree_emitter_importance<in_volume_segment>(
@@ -492,7 +580,7 @@ ccl_device int light_tree_cluster_select_emitter(KernelGlobals kg,
    else {
      selected_index = -1;
      for (int i = 0; i < knode->num_emitters; i++) {
-        int current_index = -knode->child_index + i;
+        int current_index = knode->inner.right_child + i;
        sample_resevoir(current_index,
                        float(has_importance & 1),
                        selected_index,
@@ -508,8 +596,24 @@ ccl_device int light_tree_cluster_select_emitter(KernelGlobals kg,
    }
  }

-  *pdf_factor = 0.5f * (selected_importance[0] / total_importance[0] +
-                        selected_importance[1] / total_importance[1]);
+  *pdf_factor *= 0.5f * (selected_importance[0] / total_importance[0] +
+                         selected_importance[1] / total_importance[1]);
+
+  const ccl_global KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
+                                                                         selected_index);
+
+  if (is_mesh(kemitter)) {
+    /* Transform ray from world to local space. */
+    light_tree_to_local_space<in_volume_segment>(kg, kemitter->mesh.object_id, P, N_or_D, t);
+
+    *node_index = kemitter->mesh.node_id;
+    const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes,
+                                                                     *node_index);
+    if (knode->type == LIGHT_TREE_INSTANCE) {
+      /* Switch to the node with the subtree. */
+      *node_index = knode->instance.reference;
+    }
+  }

  return selected_index;
 }
@@ -528,9 +632,9 @@ ccl_device bool get_left_probability(KernelGlobals kg,
  const ccl_global KernelLightTreeNode *right = &kernel_data_fetch(light_tree_nodes, right_index);

  float min_left_importance, max_left_importance, min_right_importance, max_right_importance;
-  light_tree_node_importance<in_volume_segment>(
+  light_tree_child_importance<in_volume_segment>(
      kg, P, N_or_D, t, has_transmission, left, max_left_importance, min_left_importance);
-  light_tree_node_importance<in_volume_segment>(
+  light_tree_child_importance<in_volume_segment>(
      kg, P, N_or_D, t, has_transmission, right, max_right_importance, min_right_importance);

  const float total_max_importance = max_left_importance + max_right_importance;
@@ -556,8 +660,8 @@ ccl_device_noinline bool light_tree_sample(KernelGlobals kg,
                                           const float randv,
                                           const float time,
                                           const float3 P,
-                                           const float3 N_or_D,
-                                           const float t,
+                                           float3 N_or_D,
+                                           float t,
                                           const int shader_flags,
                                           const int bounce,
                                           const uint32_t path_flag,
@@ -571,28 +675,38 @@ ccl_device_noinline bool light_tree_sample(KernelGlobals kg,
  float pdf_leaf = 1.0f;
  float pdf_selection = 1.0f;
  int selected_emitter = -1;
-
+  int object = 0;
  int node_index = 0; /* Root node. */

+  float3 local_P = P;
+
  /* Traverse the light tree until a leaf node is reached. */
  while (true) {
    const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes, node_index);

-    if (knode->child_index <= 0) {
+    if (is_leaf(knode)) {
      /* At a leaf node, we pick an emitter. */
      selected_emitter = light_tree_cluster_select_emitter<in_volume_segment>(
-          kg, randn, P, N_or_D, t, has_transmission, knode, &pdf_selection);
-      break;
+          kg, randn, local_P, N_or_D, t, has_transmission, &node_index, &pdf_selection);
+
+      if (node_index < 0) {
+        break;
+      }
+      else {
+        /* Continue with the picked mesh light. */
+        object = kernel_data_fetch(light_tree_emitters, selected_emitter).mesh.object_id;
+        continue;
+      }
    }

    /* At an interior node, the left child is directly after the parent, while the right child is
     * stored as the child index. */
    const int left_index = node_index + 1;
-    const int right_index = knode->child_index;
+    const int right_index = knode->inner.right_child;

    float left_prob;
    if (!get_left_probability<in_volume_segment>(
-            kg, P, N_or_D, t, has_transmission, left_index, right_index, left_prob)) {
+            kg, local_P, N_or_D, t, has_transmission, left_index, right_index, left_prob)) {
      return false; /* Both child nodes have zero importance. */
    }

@@ -610,38 +724,104 @@ ccl_device_noinline bool light_tree_sample(KernelGlobals kg,
  pdf_selection *= pdf_leaf;

  return light_sample<in_volume_segment>(
-      kg, randu, randv, time, P, bounce, path_flag, selected_emitter, pdf_selection, ls);
+      kg, randu, randv, time, P, bounce, path_flag, selected_emitter, object, pdf_selection, ls);
 }

 /* We need to be able to find the probability of selecting a given light for MIS. */
 ccl_device float light_tree_pdf(
-    KernelGlobals kg, const float3 P, const float3 N, const int path_flag, const int emitter)
+    KernelGlobals kg, float3 P, float3 N, const int path_flag, const int object, const uint target)
 {
  const bool has_transmission = (path_flag & PATH_RAY_MIS_HAD_TRANSMISSION);
-  /* Target emitter info. */
-  const int target_emitter = (emitter >= 0) ? kernel_data_fetch(triangle_to_tree, emitter) :
-                                              kernel_data_fetch(light_to_tree, ~emitter);
-  ccl_global const KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
-                                                                         target_emitter);
-  const int target_leaf = kemitter->parent_index;
-  ccl_global const KernelLightTreeNode *kleaf = &kernel_data_fetch(light_tree_nodes, target_leaf);
-  uint bit_trail = kleaf->bit_trail;

-  int node_index = 0; /* Root node. */
+  ccl_global const KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
+                                                                         target);
+  int root_index, target_leaf;
+  uint bit_trail, target_emitter;
+
+  if (is_triangle(kemitter)) {
+    /* If the target is an emissive triangle, first traverse the top level tree to find the mesh
+     * light emitter, then traverse the subtree. */
+    target_emitter = kernel_data_fetch(object_to_tree, object);
+    ccl_global const KernelLightTreeEmitter *kmesh = &kernel_data_fetch(light_tree_emitters,
+                                                                        target_emitter);
+    target_leaf = kmesh->parent_index;
+    root_index = kmesh->mesh.node_id;
+    ccl_global const KernelLightTreeNode *kroot = &kernel_data_fetch(light_tree_nodes, root_index);
+    bit_trail = kroot->bit_trail;
+
+    if (kroot->type == LIGHT_TREE_INSTANCE) {
+      root_index = kroot->instance.reference;
+    }
+  }
+  else {
+    root_index = 0;
+    target_leaf = kemitter->parent_index;
+    bit_trail = kernel_data_fetch(light_tree_nodes, target_leaf).bit_trail;
+    target_emitter = target;
+  }

  float pdf = 1.0f;
+  int node_index = 0;

  /* Traverse the light tree until we reach the target leaf node. */
  while (true) {
    const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes, node_index);

-    if (knode->child_index <= 0) {
-      break;
+    if (is_leaf(knode)) {
+      kernel_assert(node_index == target_leaf);
+      ccl_global const KernelLightTreeNode *kleaf = &kernel_data_fetch(light_tree_nodes,
+                                                                       target_leaf);
+
+      /* Iterate through leaf node to find the probability of sampling the target emitter. */
+      float target_max_importance = 0.0f;
+      float target_min_importance = 0.0f;
+      float total_max_importance = 0.0f;
+      float total_min_importance = 0.0f;
+      int num_has_importance = 0;
+      for (int i = 0; i < kleaf->num_emitters; i++) {
+        const int emitter = kleaf->leaf.first_emitter + i;
+        float max_importance, min_importance;
+        light_tree_emitter_importance<false>(
+            kg, P, N, 0, has_transmission, emitter, max_importance, min_importance);
+        num_has_importance += (max_importance > 0);
+        if (emitter == target_emitter) {
+          target_max_importance = max_importance;
+          target_min_importance = min_importance;
+        }
+        total_max_importance += max_importance;
+        total_min_importance += min_importance;
+      }
+
+      if (target_max_importance > 0.0f) {
+        pdf *= 0.5f * (target_max_importance / total_max_importance +
+                       (total_min_importance > 0 ? target_min_importance / total_min_importance :
+                                                   1.0f / num_has_importance));
+      }
+      else {
+        return 0.0f;
+      }
+
+      if (root_index) {
+        /* Arrived at the mesh light. Continue with the subtree. */
+        float unused;
+        light_tree_to_local_space<false>(kg, object, P, N, unused);
+
+        node_index = root_index;
+        root_index = 0;
+        target_emitter = target;
+        target_leaf = kemitter->parent_index;
+        bit_trail = kernel_data_fetch(light_tree_nodes, target_leaf).bit_trail;
+        continue;
+      }
+      else {
+        kernel_assert(node_index == target_leaf);
+        return pdf;
+      }
    }

    /* Interior node. */
    const int left_index = node_index + 1;
-    const int right_index = knode->child_index;
+    const int right_index = knode->inner.right_child;

    float left_prob;
    if (!get_left_probability<false>(
@@ -658,36 +838,6 @@ ccl_device float light_tree_pdf(
      return 0.0f;
    }
  }
-
-  kernel_assert(node_index == target_leaf);
-
-  /* Iterate through leaf node to find the probability of sampling the target emitter. */
-  float target_max_importance = 0.0f;
-  float target_min_importance = 0.0f;
-  float total_max_importance = 0.0f;
-  float total_min_importance = 0.0f;
-  int num_has_importance = 0;
-  for (int i = 0; i < kleaf->num_emitters; i++) {
-    const int emitter = -kleaf->child_index + i;
-    float max_importance, min_importance;
-    light_tree_emitter_importance<false>(
-        kg, P, N, 0, has_transmission, emitter, max_importance, min_importance);
-    num_has_importance += (max_importance > 0);
-    if (emitter == target_emitter) {
-      target_max_importance = max_importance;
-      target_min_importance = min_importance;
-    }
-    total_max_importance += max_importance;
-    total_min_importance += min_importance;
-  }
-
-  if (target_max_importance > 0.0f) {
-    return pdf * 0.5f *
-           (target_max_importance / total_max_importance +
-            (total_min_importance > 0 ? target_min_importance / total_min_importance :
-                                        1.0f / num_has_importance));
-  }
-  return 0.0f;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/triangle.h
+++ b/intern/cycles/kernel/light/triangle.h
@@ -304,9 +304,8 @@ ccl_device_forceinline bool triangle_light_tree_parameters(

  cos_theta_u = FLT_MAX;

-  const int object = kemitter->mesh_light.object_id;
  float3 vertices[3];
-  triangle_world_space_vertices(kg, object, kemitter->prim_id, -1.0f, vertices);
+  triangle_vertices(kg, kemitter->triangle.id, vertices);

  bool shape_above_surface = false;
  for (int i = 0; i < 3; i++) {
--- a/intern/cycles/kernel/osl/services_gpu.h
+++ b/intern/cycles/kernel/osl/services_gpu.h
@@ -1390,19 +1390,128 @@ ccl_device_extern void osl_noiseparams_set_impulses(ccl_private OSLNoiseOptions
    res->y = n; \
    res->z = n; \
  } \
-  ccl_device_extern void name##_vv(ccl_private float3 *res, const float3 *v) \
+  ccl_device_extern void name##_vv(ccl_private float3 *res, ccl_private const float3 *v) \
  { \
    const float n = name##_fv(v); \
    res->x = n; \
    res->y = n; \
    res->z = n; \
  } \
-  ccl_device_extern void name##_vvf(ccl_private float3 *res, const float3 *v, float w) \
+  ccl_device_extern void name##_vvf( \
+      ccl_private float3 *res, ccl_private const float3 *v, float w) \
  { \
    const float n = name##_fvf(v, w); \
    res->x = n; \
    res->y = n; \
    res->z = n; \
+  } \
+  ccl_device_extern void name##_dfdf(ccl_private float *res, ccl_private const float *x) \
+  { \
+    res[0] = name##_ff(x[0]); \
+    res[1] = name##_ff(x[1]); \
+    res[2] = name##_ff(x[2]); \
+  } \
+  ccl_device_extern void name##_dfdff( \
+      ccl_private float *res, ccl_private const float *x, float y) \
+  { \
+    res[0] = name##_fff(x[0], y); \
+    res[1] = name##_fff(x[1], y); \
+    res[2] = name##_fff(x[2], y); \
+  } \
+  ccl_device_extern void name##_dffdf( \
+      ccl_private float *res, float x, ccl_private const float *y) \
+  { \
+    res[0] = name##_fff(x, y[0]); \
+    res[1] = name##_fff(x, y[1]); \
+    res[2] = name##_fff(x, y[2]); \
+  } \
+  ccl_device_extern void name##_dfdfdf( \
+      ccl_private float *res, ccl_private const float *x, ccl_private const float *y) \
+  { \
+    res[0] = name##_fff(x[0], y[0]); \
+    res[1] = name##_fff(x[1], y[1]); \
+    res[2] = name##_fff(x[2], y[2]); \
+  } \
+  ccl_device_extern void name##_dfdv(ccl_private float *res, ccl_private const float3 *v) \
+  { \
+    res[0] = name##_fv(&v[0]); \
+    res[1] = name##_fv(&v[1]); \
+    res[2] = name##_fv(&v[2]); \
+  } \
+  ccl_device_extern void name##_dfdvf( \
+      ccl_private float *res, ccl_private const float3 *v, float w) \
+  { \
+    res[0] = name##_fvf(&v[0], w); \
+    res[1] = name##_fvf(&v[1], w); \
+    res[2] = name##_fvf(&v[2], w); \
+  } \
+  ccl_device_extern void name##_dfvdf( \
+      ccl_private float *res, ccl_private const float3 *v, ccl_private const float *w) \
+  { \
+    res[0] = name##_fvf(v, w[0]); \
+    res[1] = name##_fvf(v, w[1]); \
+    res[2] = name##_fvf(v, w[2]); \
+  } \
+  ccl_device_extern void name##_dfdvdf( \
+      ccl_private float *res, ccl_private const float3 *v, ccl_private const float *w) \
+  { \
+    res[0] = name##_fvf(&v[0], w[0]); \
+    res[1] = name##_fvf(&v[1], w[1]); \
+    res[2] = name##_fvf(&v[2], w[2]); \
+  } \
+  ccl_device_extern void name##_dvdf(ccl_private float3 *res, ccl_private const float *x) \
+  { \
+    name##_vf(&res[0], x[0]); \
+    name##_vf(&res[1], x[1]); \
+    name##_vf(&res[2], x[2]); \
+  } \
+  ccl_device_extern void name##_dvdff( \
+      ccl_private float3 *res, ccl_private const float *x, float y) \
+  { \
+    name##_vff(&res[0], x[0], y); \
+    name##_vff(&res[1], x[1], y); \
+    name##_vff(&res[2], x[2], y); \
+  } \
+  ccl_device_extern void name##_dvfdf( \
+      ccl_private float3 *res, float x, ccl_private const float *y) \
+  { \
+    name##_vff(&res[0], x, y[0]); \
+    name##_vff(&res[1], x, y[1]); \
+    name##_vff(&res[2], x, y[2]); \
+  } \
+  ccl_device_extern void name##_dvdfdf( \
+      ccl_private float3 *res, ccl_private const float *x, ccl_private const float *y) \
+  { \
+    name##_vff(&res[0], x[0], y[0]); \
+    name##_vff(&res[1], x[1], y[1]); \
+    name##_vff(&res[2], x[2], y[2]); \
+  } \
+  ccl_device_extern void name##_dvdv(ccl_private float3 *res, ccl_private const float3 *v) \
+  { \
+    name##_vv(&res[0], &v[0]); \
+    name##_vv(&res[1], &v[1]); \
+    name##_vv(&res[2], &v[2]); \
+  } \
+  ccl_device_extern void name##_dvdvf( \
+      ccl_private float3 *res, ccl_private const float3 *v, float w) \
+  { \
+    name##_vvf(&res[0], &v[0], w); \
+    name##_vvf(&res[1], &v[1], w); \
+    name##_vvf(&res[2], &v[2], w); \
+  } \
+  ccl_device_extern void name##_dvvdf( \
+      ccl_private float3 *res, ccl_private const float3 *v, ccl_private const float *w) \
+  { \
+    name##_vvf(&res[0], v, w[0]); \
+    name##_vvf(&res[1], v, w[1]); \
+    name##_vvf(&res[2], v, w[2]); \
+  } \
+  ccl_device_extern void name##_dvdvdf( \
+      ccl_private float3 *res, ccl_private const float3 *v, ccl_private const float *w) \
+  { \
+    name##_vvf(&res[0], &v[0], w[0]); \
+    name##_vvf(&res[1], &v[1], w[1]); \
+    name##_vvf(&res[2], &v[2], w[2]); \
  }

 ccl_device_forceinline float hashnoise_1d(float p)
--- a/intern/cycles/kernel/osl/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/osl/shaders/node_sky_texture.osl
@@ -132,11 +132,11 @@ color sky_radiance_nishita(vector dir, float nishita_data[10], string filename)
    /* definitions */
    vector sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2);
    float sun_dir_angle = precise_angle(dir, sun_dir);
-    float half_angular = angular_diameter / 2.0;
+    float half_angular = angular_diameter * 0.5;
    float dir_elevation = M_PI_2 - direction[0];

-    /* if ray inside sun disc render it, otherwise render sky.
-     * alternatively, ignore the sun if we're evaluating the background texture. */
+    /* If the ray is inside the sun disc, render it, otherwise render the sky.
+     * Alternatively, ignore the sun if we're evaluating the background texture. */
    if (sun_dir_angle < half_angular && sun_disc == 1 && raytype("importance_bake") != 1) {
      /* get 2 pixels data */
      color pixel_bottom = color(nishita_data[0], nishita_data[1], nishita_data[2]);
--- a/intern/cycles/kernel/sample/mapping.h
+++ b/intern/cycles/kernel/sample/mapping.h
@@ -84,8 +84,8 @@ ccl_device_inline void sample_uniform_cone(const float3 N,
 ccl_device_inline float pdf_uniform_cone(const float3 N, float3 D, float angle)
 {
  float zMin = cosf(angle);
-  float z = dot(N, D);
-  if (z > zMin) {
+  float z = precise_angle(N, D);
+  if (z < angle) {
    return M_1_2PI_F / (1.0f - zMin);
  }
  return 0.0f;
--- a/intern/cycles/kernel/svm/sky.h
+++ b/intern/cycles/kernel/svm/sky.h
@@ -138,12 +138,13 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals kg,
    /* definitions */
    float3 sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2_F);
    float sun_dir_angle = precise_angle(dir, sun_dir);
-    float half_angular = angular_diameter / 2.0f;
+    float half_angular = angular_diameter * 0.5f;
    float dir_elevation = M_PI_2_F - direction.x;

-    /* if ray inside sun disc render it, otherwise render sky.
-     * alternatively, ignore the sun if we're evaluating the background texture. */
-    if (sun_disc && sun_dir_angle < half_angular && !(path_flag & PATH_RAY_IMPORTANCE_BAKE)) {
+    /* If the ray is inside the sun disc, render it, otherwise render the sky.
+     * Alternatively, ignore the sun if we're evaluating the background texture. */
+    if (sun_disc && sun_dir_angle < half_angular &&
+        !((path_flag & PATH_RAY_IMPORTANCE_BAKE) && kernel_data.background.use_sun_guiding)) {
      /* get 2 pixels data */
      float y;

--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -3,8 +3,9 @@

 #pragma once

-#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
-#  if EMBREE_MAJOR_VERSION >= 4
+#if (!defined(__KERNEL_GPU__) || (defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU))) && \
+    defined(WITH_EMBREE)
+#  if EMBREE_MAJOR_VERSION == 4
 #    include <embree4/rtcore.h>
 #    include <embree4/rtcore_scene.h>
 #  else
@@ -78,9 +79,8 @@ CCL_NAMESPACE_BEGIN
 #define __VISIBILITY_FLAG__
 #define __VOLUME__

-/* TODO: solve internal compiler errors and enable light tree on HIP. */
 /* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
-#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
+#if defined(__KERNEL_METAL_AMD__)
 #  undef __LIGHT_TREE__
 #endif

@@ -1370,6 +1370,13 @@ using BoundingCone = struct BoundingCone {
  float theta_e;
 };

+enum LightTreeNodeType : uint8_t {
+  LIGHT_TREE_INSTANCE = (1 << 0),
+  LIGHT_TREE_INNER = (1 << 1),
+  LIGHT_TREE_LEAF = (1 << 2),
+  LIGHT_TREE_DISTANT = (1 << 3),
+};
+
 typedef struct KernelLightTreeNode {
  /* Bounding box. */
  BoundingBox bbox;
@@ -1380,17 +1387,25 @@ typedef struct KernelLightTreeNode {
  /* Energy. */
  float energy;

-  /* If this is 0 or less, we're at a leaf node
-   * and the negative value indexes into the first child of the light array.
-   * Otherwise, it's an index to the node's second child. */
-  int child_index;
-  int num_emitters; /* leaf nodes need to know the number of emitters stored. */
+  LightTreeNodeType type;
+
+  /* Leaf nodes need to know the number of emitters stored. */
+  int num_emitters;
+
+  union {
+    struct {
+      int first_emitter; /* The index of the first emitter. */
+    } leaf;
+    struct {
+      int right_child; /* The index of the right child. */
+    } inner;
+    struct {
+      int reference; /* A reference to the node with the subtree. */
+    } instance;
+  };

  /* Bit trail. */
  uint bit_trail;
-
-  /* Padding. */
-  int pad;
 } KernelLightTreeNode;
 static_assert_align(KernelLightTreeNode, 16);

@@ -1402,10 +1417,23 @@ typedef struct KernelLightTreeEmitter {
  /* Energy. */
  float energy;

-  /* The location in the lights or triangles array. */
-  int prim_id;
+  union {
+    struct {
+      int id; /* The location in the triangles array. */
+      EmissionSampling emission_sampling;
+    } triangle;
+
+    struct {
+      int id; /* The location in the lights array. */
+    } light;
+
+    struct {
+      int object_id;
+      int node_id;
+    } mesh;
+  };
+
  MeshLight mesh_light;
-  EmissionSampling emission_sampling;

  /* Parent. */
  int parent_index;
--- a/intern/cycles/scene/CMakeLists.txt
+++ b/intern/cycles/scene/CMakeLists.txt
@@ -15,8 +15,12 @@ set(SRC
  camera.cpp
  colorspace.cpp
  constant_fold.cpp
+  devicescene.cpp
  film.cpp
  geometry.cpp
+  geometry_attributes.cpp
+  geometry_bvh.cpp
+  geometry_mesh.cpp
  hair.cpp
  image.cpp
  image_oiio.cpp
@@ -55,6 +59,7 @@ set(SRC_HEADERS
  camera.h
  colorspace.h
  constant_fold.h
+  devicescene.h
  film.h
  geometry.h
  hair.h
--- a/intern/cycles/scene/devicescene.cpp
+++ b/intern/cycles/scene/devicescene.cpp
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "scene/devicescene.h"
+#include "device/device.h"
+#include "device/memory.h"
+
+CCL_NAMESPACE_BEGIN
+
+DeviceScene::DeviceScene(Device *device)
+    : bvh_nodes(device, "bvh_nodes", MEM_GLOBAL),
+      bvh_leaf_nodes(device, "bvh_leaf_nodes", MEM_GLOBAL),
+      object_node(device, "object_node", MEM_GLOBAL),
+      prim_type(device, "prim_type", MEM_GLOBAL),
+      prim_visibility(device, "prim_visibility", MEM_GLOBAL),
+      prim_index(device, "prim_index", MEM_GLOBAL),
+      prim_object(device, "prim_object", MEM_GLOBAL),
+      prim_time(device, "prim_time", MEM_GLOBAL),
+      tri_verts(device, "tri_verts", MEM_GLOBAL),
+      tri_shader(device, "tri_shader", MEM_GLOBAL),
+      tri_vnormal(device, "tri_vnormal", MEM_GLOBAL),
+      tri_vindex(device, "tri_vindex", MEM_GLOBAL),
+      tri_patch(device, "tri_patch", MEM_GLOBAL),
+      tri_patch_uv(device, "tri_patch_uv", MEM_GLOBAL),
+      curves(device, "curves", MEM_GLOBAL),
+      curve_keys(device, "curve_keys", MEM_GLOBAL),
+      curve_segments(device, "curve_segments", MEM_GLOBAL),
+      patches(device, "patches", MEM_GLOBAL),
+      points(device, "points", MEM_GLOBAL),
+      points_shader(device, "points_shader", MEM_GLOBAL),
+      objects(device, "objects", MEM_GLOBAL),
+      object_motion_pass(device, "object_motion_pass", MEM_GLOBAL),
+      object_motion(device, "object_motion", MEM_GLOBAL),
+      object_flag(device, "object_flag", MEM_GLOBAL),
+      object_volume_step(device, "object_volume_step", MEM_GLOBAL),
+      object_prim_offset(device, "object_prim_offset", MEM_GLOBAL),
+      camera_motion(device, "camera_motion", MEM_GLOBAL),
+      attributes_map(device, "attributes_map", MEM_GLOBAL),
+      attributes_float(device, "attributes_float", MEM_GLOBAL),
+      attributes_float2(device, "attributes_float2", MEM_GLOBAL),
+      attributes_float3(device, "attributes_float3", MEM_GLOBAL),
+      attributes_float4(device, "attributes_float4", MEM_GLOBAL),
+      attributes_uchar4(device, "attributes_uchar4", MEM_GLOBAL),
+      light_distribution(device, "light_distribution", MEM_GLOBAL),
+      lights(device, "lights", MEM_GLOBAL),
+      light_background_marginal_cdf(device, "light_background_marginal_cdf", MEM_GLOBAL),
+      light_background_conditional_cdf(device, "light_background_conditional_cdf", MEM_GLOBAL),
+      light_tree_nodes(device, "light_tree_nodes", MEM_GLOBAL),
+      light_tree_emitters(device, "light_tree_emitters", MEM_GLOBAL),
+      light_to_tree(device, "light_to_tree", MEM_GLOBAL),
+      object_to_tree(device, "object_to_tree", MEM_GLOBAL),
+      object_lookup_offset(device, "object_lookup_offset", MEM_GLOBAL),
+      triangle_to_tree(device, "triangle_to_tree", MEM_GLOBAL),
+      particles(device, "particles", MEM_GLOBAL),
+      svm_nodes(device, "svm_nodes", MEM_GLOBAL),
+      shaders(device, "shaders", MEM_GLOBAL),
+      lookup_table(device, "lookup_table", MEM_GLOBAL),
+      sample_pattern_lut(device, "sample_pattern_lut", MEM_GLOBAL),
+      ies_lights(device, "ies", MEM_GLOBAL)
+{
+  memset((void *)&data, 0, sizeof(data));
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/scene/devicescene.h
+++ b/intern/cycles/scene/devicescene.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef __DEVICESCENE_H__
+#define __DEVICESCENE_H__
+
+#include "device/device.h"
+#include "device/memory.h"
+
+#include "util/types.h"
+#include "util/vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceScene {
+ public:
+  /* BVH */
+  device_vector<int4> bvh_nodes;
+  device_vector<int4> bvh_leaf_nodes;
+  device_vector<int> object_node;
+  device_vector<int> prim_type;
+  device_vector<uint> prim_visibility;
+  device_vector<int> prim_index;
+  device_vector<int> prim_object;
+  device_vector<float2> prim_time;
+
+  /* mesh */
+  device_vector<packed_float3> tri_verts;
+  device_vector<uint> tri_shader;
+  device_vector<packed_float3> tri_vnormal;
+  device_vector<packed_uint3> tri_vindex;
+  device_vector<uint> tri_patch;
+  device_vector<float2> tri_patch_uv;
+
+  device_vector<KernelCurve> curves;
+  device_vector<float4> curve_keys;
+  device_vector<KernelCurveSegment> curve_segments;
+
+  device_vector<uint> patches;
+
+  /* point-cloud */
+  device_vector<float4> points;
+  device_vector<uint> points_shader;
+
+  /* objects */
+  device_vector<KernelObject> objects;
+  device_vector<Transform> object_motion_pass;
+  device_vector<DecomposedTransform> object_motion;
+  device_vector<uint> object_flag;
+  device_vector<float> object_volume_step;
+  device_vector<uint> object_prim_offset;
+
+  /* cameras */
+  device_vector<DecomposedTransform> camera_motion;
+
+  /* attributes */
+  device_vector<AttributeMap> attributes_map;
+  device_vector<float> attributes_float;
+  device_vector<float2> attributes_float2;
+  device_vector<packed_float3> attributes_float3;
+  device_vector<float4> attributes_float4;
+  device_vector<uchar4> attributes_uchar4;
+
+  /* lights */
+  device_vector<KernelLightDistribution> light_distribution;
+  device_vector<KernelLight> lights;
+  device_vector<float2> light_background_marginal_cdf;
+  device_vector<float2> light_background_conditional_cdf;
+
+  /* light tree */
+  device_vector<KernelLightTreeNode> light_tree_nodes;
+  device_vector<KernelLightTreeEmitter> light_tree_emitters;
+  device_vector<uint> light_to_tree;
+  device_vector<uint> object_to_tree;
+  device_vector<uint> object_lookup_offset;
+  device_vector<uint> triangle_to_tree;
+
+  /* particles */
+  device_vector<KernelParticle> particles;
+
+  /* shaders */
+  device_vector<int4> svm_nodes;
+  device_vector<KernelShader> shaders;
+
+  /* lookup tables */
+  device_vector<float> lookup_table;
+
+  /* integrator */
+  device_vector<float> sample_pattern_lut;
+
+  /* IES lights */
+  device_vector<float> ies_lights;
+
+  KernelData data;
+
+  DeviceScene(Device *device);
+};
+
+CCL_NAMESPACE_END
+
+#endif /*  __DEVICESCENE_H__ */
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
--- a/intern/cycles/scene/geometry.h
+++ b/intern/cycles/scene/geometry.h
@@ -30,6 +30,38 @@ class Shader;
 class Volume;
 struct PackedBVH;

+/* Set of flags used to help determining what data has been modified or needs reallocation, so we
+ * can decide which device data to free or update. */
+enum {
+  DEVICE_CURVE_DATA_MODIFIED = (1 << 0),
+  DEVICE_MESH_DATA_MODIFIED = (1 << 1),
+  DEVICE_POINT_DATA_MODIFIED = (1 << 2),
+
+  ATTR_FLOAT_MODIFIED = (1 << 3),
+  ATTR_FLOAT2_MODIFIED = (1 << 4),
+  ATTR_FLOAT3_MODIFIED = (1 << 5),
+  ATTR_FLOAT4_MODIFIED = (1 << 6),
+  ATTR_UCHAR4_MODIFIED = (1 << 7),
+
+  CURVE_DATA_NEED_REALLOC = (1 << 8),
+  MESH_DATA_NEED_REALLOC = (1 << 9),
+  POINT_DATA_NEED_REALLOC = (1 << 10),
+
+  ATTR_FLOAT_NEEDS_REALLOC = (1 << 11),
+  ATTR_FLOAT2_NEEDS_REALLOC = (1 << 12),
+  ATTR_FLOAT3_NEEDS_REALLOC = (1 << 13),
+  ATTR_FLOAT4_NEEDS_REALLOC = (1 << 14),
+
+  ATTR_UCHAR4_NEEDS_REALLOC = (1 << 15),
+
+  ATTRS_NEED_REALLOC = (ATTR_FLOAT_NEEDS_REALLOC | ATTR_FLOAT2_NEEDS_REALLOC |
+                        ATTR_FLOAT3_NEEDS_REALLOC | ATTR_FLOAT4_NEEDS_REALLOC |
+                        ATTR_UCHAR4_NEEDS_REALLOC),
+  DEVICE_MESH_DATA_NEEDS_REALLOC = (MESH_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
+  DEVICE_POINT_DATA_NEEDS_REALLOC = (POINT_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
+  DEVICE_CURVE_DATA_NEEDS_REALLOC = (CURVE_DATA_NEED_REALLOC | ATTRS_NEED_REALLOC),
+};
+
 /* Geometry
 *
 * Base class for geometric types like Mesh and Hair. */
--- a/intern/cycles/scene/geometry_attributes.cpp
+++ b/intern/cycles/scene/geometry_attributes.cpp
@@ -0,0 +1,722 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "bvh/bvh.h"
+#include "bvh/bvh2.h"
+
+#include "device/device.h"
+
+#include "scene/attribute.h"
+#include "scene/camera.h"
+#include "scene/geometry.h"
+#include "scene/hair.h"
+#include "scene/light.h"
+#include "scene/mesh.h"
+#include "scene/object.h"
+#include "scene/pointcloud.h"
+#include "scene/scene.h"
+#include "scene/shader.h"
+#include "scene/shader_nodes.h"
+#include "scene/stats.h"
+#include "scene/volume.h"
+
+#include "subd/patch_table.h"
+#include "subd/split.h"
+
+#include "kernel/osl/globals.h"
+
+#include "util/foreach.h"
+#include "util/log.h"
+#include "util/progress.h"
+#include "util/task.h"
+
+CCL_NAMESPACE_BEGIN
+
+bool Geometry::need_attribute(Scene *scene, AttributeStandard std)
+{
+  if (std == ATTR_STD_NONE)
+    return false;
+
+  if (scene->need_global_attribute(std))
+    return true;
+
+  foreach (Node *node, used_shaders) {
+    Shader *shader = static_cast<Shader *>(node);
+    if (shader->attributes.find(std))
+      return true;
+  }
+
+  return false;
+}
+
+bool Geometry::need_attribute(Scene * /*scene*/, ustring name)
+{
+  if (name == ustring())
+    return false;
+
+  foreach (Node *node, used_shaders) {
+    Shader *shader = static_cast<Shader *>(node);
+    if (shader->attributes.find(name))
+      return true;
+  }
+
+  return false;
+}
+
+AttributeRequestSet Geometry::needed_attributes()
+{
+  AttributeRequestSet result;
+
+  foreach (Node *node, used_shaders) {
+    Shader *shader = static_cast<Shader *>(node);
+    result.add(shader->attributes);
+  }
+
+  return result;
+}
+
+bool Geometry::has_voxel_attributes() const
+{
+  foreach (const Attribute &attr, attributes.attributes) {
+    if (attr.element == ATTR_ELEMENT_VOXEL) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/* Generate a normal attribute map entry from an attribute descriptor. */
+static void emit_attribute_map_entry(AttributeMap *attr_map,
+                                     size_t index,
+                                     uint64_t id,
+                                     TypeDesc type,
+                                     const AttributeDescriptor &desc)
+{
+  attr_map[index].id = id;
+  attr_map[index].element = desc.element;
+  attr_map[index].offset = as_uint(desc.offset);
+
+  if (type == TypeDesc::TypeFloat)
+    attr_map[index].type = NODE_ATTR_FLOAT;
+  else if (type == TypeDesc::TypeMatrix)
+    attr_map[index].type = NODE_ATTR_MATRIX;
+  else if (type == TypeFloat2)
+    attr_map[index].type = NODE_ATTR_FLOAT2;
+  else if (type == TypeFloat4)
+    attr_map[index].type = NODE_ATTR_FLOAT4;
+  else if (type == TypeRGBA)
+    attr_map[index].type = NODE_ATTR_RGBA;
+  else
+    attr_map[index].type = NODE_ATTR_FLOAT3;
+
+  attr_map[index].flags = desc.flags;
+}
+
+/* Generate an attribute map end marker, optionally including a link to another map.
+ * Links are used to connect object attribute maps to mesh attribute maps. */
+static void emit_attribute_map_terminator(AttributeMap *attr_map,
+                                          size_t index,
+                                          bool chain,
+                                          uint chain_link)
+{
+  for (int j = 0; j < ATTR_PRIM_TYPES; j++) {
+    attr_map[index + j].id = ATTR_STD_NONE;
+    attr_map[index + j].element = chain;                     /* link is valid flag */
+    attr_map[index + j].offset = chain ? chain_link + j : 0; /* link to the correct sub-entry */
+    attr_map[index + j].type = 0;
+    attr_map[index + j].flags = 0;
+  }
+}
+
+/* Generate all necessary attribute map entries from the attribute request. */
+static void emit_attribute_mapping(
+    AttributeMap *attr_map, size_t index, uint64_t id, AttributeRequest &req, Geometry *geom)
+{
+  emit_attribute_map_entry(attr_map, index, id, req.type, req.desc);
+
+  if (geom->is_mesh()) {
+    Mesh *mesh = static_cast<Mesh *>(geom);
+    if (mesh->get_num_subd_faces()) {
+      emit_attribute_map_entry(attr_map, index + 1, id, req.subd_type, req.subd_desc);
+    }
+  }
+}
+
+void GeometryManager::update_svm_attributes(Device *,
+                                            DeviceScene *dscene,
+                                            Scene *scene,
+                                            vector<AttributeRequestSet> &geom_attributes,
+                                            vector<AttributeRequestSet> &object_attributes)
+{
+  /* for SVM, the attributes_map table is used to lookup the offset of an
+   * attribute, based on a unique shader attribute id. */
+
+  /* compute array stride */
+  size_t attr_map_size = 0;
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    geom->attr_map_offset = attr_map_size;
+
+#ifdef WITH_OSL
+    size_t attr_count = 0;
+    foreach (AttributeRequest &req, geom_attributes[i].requests) {
+      if (req.std != ATTR_STD_NONE &&
+          scene->shader_manager->get_attribute_id(req.std) != (uint64_t)req.std)
+        attr_count += 2;
+      else
+        attr_count += 1;
+    }
+#else
+    const size_t attr_count = geom_attributes[i].size();
+#endif
+
+    attr_map_size += (attr_count + 1) * ATTR_PRIM_TYPES;
+  }
+
+  for (size_t i = 0; i < scene->objects.size(); i++) {
+    Object *object = scene->objects[i];
+
+    /* only allocate a table for the object if it actually has attributes */
+    if (object_attributes[i].size() == 0) {
+      object->attr_map_offset = 0;
+    }
+    else {
+      object->attr_map_offset = attr_map_size;
+      attr_map_size += (object_attributes[i].size() + 1) * ATTR_PRIM_TYPES;
+    }
+  }
+
+  if (attr_map_size == 0)
+    return;
+
+  if (!dscene->attributes_map.need_realloc()) {
+    return;
+  }
+
+  /* create attribute map */
+  AttributeMap *attr_map = dscene->attributes_map.alloc(attr_map_size);
+  memset(attr_map, 0, dscene->attributes_map.size() * sizeof(*attr_map));
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    AttributeRequestSet &attributes = geom_attributes[i];
+
+    /* set geometry attributes */
+    size_t index = geom->attr_map_offset;
+
+    foreach (AttributeRequest &req, attributes.requests) {
+      uint64_t id;
+      if (req.std == ATTR_STD_NONE)
+        id = scene->shader_manager->get_attribute_id(req.name);
+      else
+        id = scene->shader_manager->get_attribute_id(req.std);
+
+      emit_attribute_mapping(attr_map, index, id, req, geom);
+      index += ATTR_PRIM_TYPES;
+
+#ifdef WITH_OSL
+      /* Some standard attributes are explicitly referenced via their standard ID, so add those
+       * again in case they were added under a different attribute ID. */
+      if (req.std != ATTR_STD_NONE && id != (uint64_t)req.std) {
+        emit_attribute_mapping(attr_map, index, (uint64_t)req.std, req, geom);
+        index += ATTR_PRIM_TYPES;
+      }
+#endif
+    }
+
+    emit_attribute_map_terminator(attr_map, index, false, 0);
+  }
+
+  for (size_t i = 0; i < scene->objects.size(); i++) {
+    Object *object = scene->objects[i];
+    AttributeRequestSet &attributes = object_attributes[i];
+
+    /* set object attributes */
+    if (attributes.size() > 0) {
+      size_t index = object->attr_map_offset;
+
+      foreach (AttributeRequest &req, attributes.requests) {
+        uint64_t id;
+        if (req.std == ATTR_STD_NONE)
+          id = scene->shader_manager->get_attribute_id(req.name);
+        else
+          id = scene->shader_manager->get_attribute_id(req.std);
+
+        emit_attribute_mapping(attr_map, index, id, req, object->geometry);
+        index += ATTR_PRIM_TYPES;
+      }
+
+      emit_attribute_map_terminator(attr_map, index, true, object->geometry->attr_map_offset);
+    }
+  }
+
+  /* copy to device */
+  dscene->attributes_map.copy_to_device();
+}
+
+void GeometryManager::update_attribute_element_offset(Geometry *geom,
+                                                      device_vector<float> &attr_float,
+                                                      size_t &attr_float_offset,
+                                                      device_vector<float2> &attr_float2,
+                                                      size_t &attr_float2_offset,
+                                                      device_vector<packed_float3> &attr_float3,
+                                                      size_t &attr_float3_offset,
+                                                      device_vector<float4> &attr_float4,
+                                                      size_t &attr_float4_offset,
+                                                      device_vector<uchar4> &attr_uchar4,
+                                                      size_t &attr_uchar4_offset,
+                                                      Attribute *mattr,
+                                                      AttributePrimitive prim,
+                                                      TypeDesc &type,
+                                                      AttributeDescriptor &desc)
+{
+  if (mattr) {
+    /* store element and type */
+    desc.element = mattr->element;
+    desc.flags = mattr->flags;
+    type = mattr->type;
+
+    /* store attribute data in arrays */
+    size_t size = mattr->element_size(geom, prim);
+
+    AttributeElement &element = desc.element;
+    int &offset = desc.offset;
+
+    if (mattr->element == ATTR_ELEMENT_VOXEL) {
+      /* store slot in offset value */
+      ImageHandle &handle = mattr->data_voxel();
+      offset = handle.svm_slot();
+    }
+    else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+      uchar4 *data = mattr->data_uchar4();
+      offset = attr_uchar4_offset;
+
+      assert(attr_uchar4.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_uchar4[offset + k] = data[k];
+        }
+        attr_uchar4.tag_modified();
+      }
+      attr_uchar4_offset += size;
+    }
+    else if (mattr->type == TypeDesc::TypeFloat) {
+      float *data = mattr->data_float();
+      offset = attr_float_offset;
+
+      assert(attr_float.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_float[offset + k] = data[k];
+        }
+        attr_float.tag_modified();
+      }
+      attr_float_offset += size;
+    }
+    else if (mattr->type == TypeFloat2) {
+      float2 *data = mattr->data_float2();
+      offset = attr_float2_offset;
+
+      assert(attr_float2.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_float2[offset + k] = data[k];
+        }
+        attr_float2.tag_modified();
+      }
+      attr_float2_offset += size;
+    }
+    else if (mattr->type == TypeDesc::TypeMatrix) {
+      Transform *tfm = mattr->data_transform();
+      offset = attr_float4_offset;
+
+      assert(attr_float4.size() >= offset + size * 3);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size * 3; k++) {
+          attr_float4[offset + k] = (&tfm->x)[k];
+        }
+        attr_float4.tag_modified();
+      }
+      attr_float4_offset += size * 3;
+    }
+    else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) {
+      float4 *data = mattr->data_float4();
+      offset = attr_float4_offset;
+
+      assert(attr_float4.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_float4[offset + k] = data[k];
+        }
+        attr_float4.tag_modified();
+      }
+      attr_float4_offset += size;
+    }
+    else {
+      float3 *data = mattr->data_float3();
+      offset = attr_float3_offset;
+
+      assert(attr_float3.size() >= offset + size);
+      if (mattr->modified) {
+        for (size_t k = 0; k < size; k++) {
+          attr_float3[offset + k] = data[k];
+        }
+        attr_float3.tag_modified();
+      }
+      attr_float3_offset += size;
+    }
+
+    /* mesh vertex/curve index is global, not per object, so we sneak
+     * a correction for that in here */
+    if (geom->is_mesh()) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+      if (mesh->subdivision_type == Mesh::SUBDIVISION_CATMULL_CLARK &&
+          desc.flags & ATTR_SUBDIVIDED) {
+        /* Indices for subdivided attributes are retrieved
+         * from patch table so no need for correction here. */
+      }
+      else if (element == ATTR_ELEMENT_VERTEX)
+        offset -= mesh->vert_offset;
+      else if (element == ATTR_ELEMENT_VERTEX_MOTION)
+        offset -= mesh->vert_offset;
+      else if (element == ATTR_ELEMENT_FACE) {
+        if (prim == ATTR_PRIM_GEOMETRY)
+          offset -= mesh->prim_offset;
+        else
+          offset -= mesh->face_offset;
+      }
+      else if (element == ATTR_ELEMENT_CORNER || element == ATTR_ELEMENT_CORNER_BYTE) {
+        if (prim == ATTR_PRIM_GEOMETRY)
+          offset -= 3 * mesh->prim_offset;
+        else
+          offset -= mesh->corner_offset;
+      }
+    }
+    else if (geom->is_hair()) {
+      Hair *hair = static_cast<Hair *>(geom);
+      if (element == ATTR_ELEMENT_CURVE)
+        offset -= hair->prim_offset;
+      else if (element == ATTR_ELEMENT_CURVE_KEY)
+        offset -= hair->curve_key_offset;
+      else if (element == ATTR_ELEMENT_CURVE_KEY_MOTION)
+        offset -= hair->curve_key_offset;
+    }
+    else if (geom->is_pointcloud()) {
+      if (element == ATTR_ELEMENT_VERTEX)
+        offset -= geom->prim_offset;
+      else if (element == ATTR_ELEMENT_VERTEX_MOTION)
+        offset -= geom->prim_offset;
+    }
+  }
+  else {
+    /* attribute not found */
+    desc.element = ATTR_ELEMENT_NONE;
+    desc.offset = 0;
+  }
+}
+
+static void update_attribute_element_size(Geometry *geom,
+                                          Attribute *mattr,
+                                          AttributePrimitive prim,
+                                          size_t *attr_float_size,
+                                          size_t *attr_float2_size,
+                                          size_t *attr_float3_size,
+                                          size_t *attr_float4_size,
+                                          size_t *attr_uchar4_size)
+{
+  if (mattr) {
+    size_t size = mattr->element_size(geom, prim);
+
+    if (mattr->element == ATTR_ELEMENT_VOXEL) {
+      /* pass */
+    }
+    else if (mattr->element == ATTR_ELEMENT_CORNER_BYTE) {
+      *attr_uchar4_size += size;
+    }
+    else if (mattr->type == TypeDesc::TypeFloat) {
+      *attr_float_size += size;
+    }
+    else if (mattr->type == TypeFloat2) {
+      *attr_float2_size += size;
+    }
+    else if (mattr->type == TypeDesc::TypeMatrix) {
+      *attr_float4_size += size * 4;
+    }
+    else if (mattr->type == TypeFloat4 || mattr->type == TypeRGBA) {
+      *attr_float4_size += size;
+    }
+    else {
+      *attr_float3_size += size;
+    }
+  }
+}
+
+void GeometryManager::device_update_attributes(Device *device,
+                                               DeviceScene *dscene,
+                                               Scene *scene,
+                                               Progress &progress)
+{
+  progress.set_status("Updating Mesh", "Computing attributes");
+
+  /* gather per mesh requested attributes. as meshes may have multiple
+   * shaders assigned, this merges the requested attributes that have
+   * been set per shader by the shader manager */
+  vector<AttributeRequestSet> geom_attributes(scene->geometry.size());
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+
+    geom->index = i;
+    scene->need_global_attributes(geom_attributes[i]);
+
+    foreach (Node *node, geom->get_used_shaders()) {
+      Shader *shader = static_cast<Shader *>(node);
+      geom_attributes[i].add(shader->attributes);
+    }
+
+    if (geom->is_hair() && static_cast<Hair *>(geom)->need_shadow_transparency()) {
+      geom_attributes[i].add(ATTR_STD_SHADOW_TRANSPARENCY);
+    }
+  }
+
+  /* convert object attributes to use the same data structures as geometry ones */
+  vector<AttributeRequestSet> object_attributes(scene->objects.size());
+  vector<AttributeSet> object_attribute_values;
+
+  object_attribute_values.reserve(scene->objects.size());
+
+  for (size_t i = 0; i < scene->objects.size(); i++) {
+    Object *object = scene->objects[i];
+    Geometry *geom = object->geometry;
+    size_t geom_idx = geom->index;
+
+    assert(geom_idx < scene->geometry.size() && scene->geometry[geom_idx] == geom);
+
+    object_attribute_values.push_back(AttributeSet(geom, ATTR_PRIM_GEOMETRY));
+
+    AttributeRequestSet &geom_requests = geom_attributes[geom_idx];
+    AttributeRequestSet &attributes = object_attributes[i];
+    AttributeSet &values = object_attribute_values[i];
+
+    for (size_t j = 0; j < object->attributes.size(); j++) {
+      ParamValue &param = object->attributes[j];
+
+      /* add attributes that are requested and not already handled by the mesh */
+      if (geom_requests.find(param.name()) && !geom->attributes.find(param.name())) {
+        attributes.add(param.name());
+
+        Attribute *attr = values.add(param.name(), param.type(), ATTR_ELEMENT_OBJECT);
+        assert(param.datasize() == attr->buffer.size());
+        memcpy(attr->buffer.data(), param.data(), param.datasize());
+      }
+    }
+  }
+
+  /* mesh attribute are stored in a single array per data type. here we fill
+   * those arrays, and set the offset and element type to create attribute
+   * maps next */
+
+  /* Pre-allocate attributes to avoid arrays re-allocation which would
+   * take 2x of overall attribute memory usage.
+   */
+  size_t attr_float_size = 0;
+  size_t attr_float2_size = 0;
+  size_t attr_float3_size = 0;
+  size_t attr_float4_size = 0;
+  size_t attr_uchar4_size = 0;
+
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    AttributeRequestSet &attributes = geom_attributes[i];
+    foreach (AttributeRequest &req, attributes.requests) {
+      Attribute *attr = geom->attributes.find(req);
+
+      update_attribute_element_size(geom,
+                                    attr,
+                                    ATTR_PRIM_GEOMETRY,
+                                    &attr_float_size,
+                                    &attr_float2_size,
+                                    &attr_float3_size,
+                                    &attr_float4_size,
+                                    &attr_uchar4_size);
+
+      if (geom->is_mesh()) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        Attribute *subd_attr = mesh->subd_attributes.find(req);
+
+        update_attribute_element_size(mesh,
+                                      subd_attr,
+                                      ATTR_PRIM_SUBD,
+                                      &attr_float_size,
+                                      &attr_float2_size,
+                                      &attr_float3_size,
+                                      &attr_float4_size,
+                                      &attr_uchar4_size);
+      }
+    }
+  }
+
+  for (size_t i = 0; i < scene->objects.size(); i++) {
+    Object *object = scene->objects[i];
+
+    foreach (Attribute &attr, object_attribute_values[i].attributes) {
+      update_attribute_element_size(object->geometry,
+                                    &attr,
+                                    ATTR_PRIM_GEOMETRY,
+                                    &attr_float_size,
+                                    &attr_float2_size,
+                                    &attr_float3_size,
+                                    &attr_float4_size,
+                                    &attr_uchar4_size);
+    }
+  }
+
+  dscene->attributes_float.alloc(attr_float_size);
+  dscene->attributes_float2.alloc(attr_float2_size);
+  dscene->attributes_float3.alloc(attr_float3_size);
+  dscene->attributes_float4.alloc(attr_float4_size);
+  dscene->attributes_uchar4.alloc(attr_uchar4_size);
+
+  /* The order of those flags needs to match that of AttrKernelDataType. */
+  const bool attributes_need_realloc[AttrKernelDataType::NUM] = {
+      dscene->attributes_float.need_realloc(),
+      dscene->attributes_float2.need_realloc(),
+      dscene->attributes_float3.need_realloc(),
+      dscene->attributes_float4.need_realloc(),
+      dscene->attributes_uchar4.need_realloc(),
+  };
+
+  size_t attr_float_offset = 0;
+  size_t attr_float2_offset = 0;
+  size_t attr_float3_offset = 0;
+  size_t attr_float4_offset = 0;
+  size_t attr_uchar4_offset = 0;
+
+  /* Fill in attributes. */
+  for (size_t i = 0; i < scene->geometry.size(); i++) {
+    Geometry *geom = scene->geometry[i];
+    AttributeRequestSet &attributes = geom_attributes[i];
+
+    /* todo: we now store std and name attributes from requests even if
+     * they actually refer to the same mesh attributes, optimize */
+    foreach (AttributeRequest &req, attributes.requests) {
+      Attribute *attr = geom->attributes.find(req);
+
+      if (attr) {
+        /* force a copy if we need to reallocate all the data */
+        attr->modified |= attributes_need_realloc[Attribute::kernel_type(*attr)];
+      }
+
+      update_attribute_element_offset(geom,
+                                      dscene->attributes_float,
+                                      attr_float_offset,
+                                      dscene->attributes_float2,
+                                      attr_float2_offset,
+                                      dscene->attributes_float3,
+                                      attr_float3_offset,
+                                      dscene->attributes_float4,
+                                      attr_float4_offset,
+                                      dscene->attributes_uchar4,
+                                      attr_uchar4_offset,
+                                      attr,
+                                      ATTR_PRIM_GEOMETRY,
+                                      req.type,
+                                      req.desc);
+
+      if (geom->is_mesh()) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        Attribute *subd_attr = mesh->subd_attributes.find(req);
+
+        if (subd_attr) {
+          /* force a copy if we need to reallocate all the data */
+          subd_attr->modified |= attributes_need_realloc[Attribute::kernel_type(*subd_attr)];
+        }
+
+        update_attribute_element_offset(mesh,
+                                        dscene->attributes_float,
+                                        attr_float_offset,
+                                        dscene->attributes_float2,
+                                        attr_float2_offset,
+                                        dscene->attributes_float3,
+                                        attr_float3_offset,
+                                        dscene->attributes_float4,
+                                        attr_float4_offset,
+                                        dscene->attributes_uchar4,
+                                        attr_uchar4_offset,
+                                        subd_attr,
+                                        ATTR_PRIM_SUBD,
+                                        req.subd_type,
+                                        req.subd_desc);
+      }
+
+      if (progress.get_cancel())
+        return;
+    }
+  }
+
+  for (size_t i = 0; i < scene->objects.size(); i++) {
+    Object *object = scene->objects[i];
+    AttributeRequestSet &attributes = object_attributes[i];
+    AttributeSet &values = object_attribute_values[i];
+
+    foreach (AttributeRequest &req, attributes.requests) {
+      Attribute *attr = values.find(req);
+
+      if (attr) {
+        attr->modified |= attributes_need_realloc[Attribute::kernel_type(*attr)];
+      }
+
+      update_attribute_element_offset(object->geometry,
+                                      dscene->attributes_float,
+                                      attr_float_offset,
+                                      dscene->attributes_float2,
+                                      attr_float2_offset,
+                                      dscene->attributes_float3,
+                                      attr_float3_offset,
+                                      dscene->attributes_float4,
+                                      attr_float4_offset,
+                                      dscene->attributes_uchar4,
+                                      attr_uchar4_offset,
+                                      attr,
+                                      ATTR_PRIM_GEOMETRY,
+                                      req.type,
+                                      req.desc);
+
+      /* object attributes don't care about subdivision */
+      req.subd_type = req.type;
+      req.subd_desc = req.desc;
+
+      if (progress.get_cancel())
+        return;
+    }
+  }
+
+  /* create attribute lookup maps */
+  if (scene->shader_manager->use_osl())
+    update_osl_globals(device, scene);
+
+  update_svm_attributes(device, dscene, scene, geom_attributes, object_attributes);
+
+  if (progress.get_cancel())
+    return;
+
+  /* copy to device */
+  progress.set_status("Updating Mesh", "Copying Attributes to device");
+
+  dscene->attributes_float.copy_to_device_if_modified();
+  dscene->attributes_float2.copy_to_device_if_modified();
+  dscene->attributes_float3.copy_to_device_if_modified();
+  dscene->attributes_float4.copy_to_device_if_modified();
+  dscene->attributes_uchar4.copy_to_device_if_modified();
+
+  if (progress.get_cancel())
+    return;
+
+  /* After mesh attributes and patch tables have been copied to device memory,
+   * we need to update offsets in the objects. */
+  scene->object_manager->device_update_geom_offsets(device, dscene, scene);
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/scene/geometry_bvh.cpp
+++ b/intern/cycles/scene/geometry_bvh.cpp
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "bvh/bvh.h"
+#include "bvh/bvh2.h"
+
+#include "device/device.h"
+
+#include "scene/attribute.h"
+#include "scene/camera.h"
+#include "scene/geometry.h"
+#include "scene/hair.h"
+#include "scene/light.h"
+#include "scene/mesh.h"
+#include "scene/object.h"
+#include "scene/pointcloud.h"
+#include "scene/scene.h"
+#include "scene/shader.h"
+#include "scene/shader_nodes.h"
+#include "scene/stats.h"
+#include "scene/volume.h"
+
+#include "subd/patch_table.h"
+#include "subd/split.h"
+
+#include "kernel/osl/globals.h"
+
+#include "util/foreach.h"
+#include "util/log.h"
+#include "util/progress.h"
+#include "util/task.h"
+
+CCL_NAMESPACE_BEGIN
+
+void Geometry::compute_bvh(Device *device,
+                           DeviceScene *dscene,
+                           SceneParams *params,
+                           Progress *progress,
+                           size_t n,
+                           size_t total)
+{
+  if (progress->get_cancel())
+    return;
+
+  compute_bounds();
+
+  const BVHLayout bvh_layout = BVHParams::best_bvh_layout(
+      params->bvh_layout, device->get_bvh_layout_mask(dscene->data.kernel_features));
+  if (need_build_bvh(bvh_layout)) {
+    string msg = "Updating Geometry BVH ";
+    if (name.empty())
+      msg += string_printf("%u/%u", (uint)(n + 1), (uint)total);
+    else
+      msg += string_printf("%s %u/%u", name.c_str(), (uint)(n + 1), (uint)total);
+
+    Object object;
+
+    /* Ensure all visibility bits are set at the geometry level BVH. In
+     * the object level BVH is where actual visibility is tested. */
+    object.set_is_shadow_catcher(true);
+    object.set_visibility(~0);
+
+    object.set_geometry(this);
+
+    vector<Geometry *> geometry;
+    geometry.push_back(this);
+    vector<Object *> objects;
+    objects.push_back(&object);
+
+    if (bvh && !need_update_rebuild) {
+      progress->set_status(msg, "Refitting BVH");
+
+      bvh->replace_geometry(geometry, objects);
+
+      device->build_bvh(bvh, *progress, true);
+    }
+    else {
+      progress->set_status(msg, "Building BVH");
+
+      BVHParams bparams;
+      bparams.use_spatial_split = params->use_bvh_spatial_split;
+      bparams.use_compact_structure = params->use_bvh_compact_structure;
+      bparams.bvh_layout = bvh_layout;
+      bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
+                                    params->use_bvh_unaligned_nodes;
+      bparams.num_motion_triangle_steps = params->num_bvh_time_steps;
+      bparams.num_motion_curve_steps = params->num_bvh_time_steps;
+      bparams.num_motion_point_steps = params->num_bvh_time_steps;
+      bparams.bvh_type = params->bvh_type;
+      bparams.curve_subdivisions = params->curve_subdivisions();
+
+      delete bvh;
+      bvh = BVH::create(bparams, geometry, objects, device);
+      MEM_GUARDED_CALL(progress, device->build_bvh, bvh, *progress, false);
+    }
+  }
+
+  need_update_rebuild = false;
+  need_update_bvh_for_offset = false;
+}
+
+void GeometryManager::device_update_bvh(Device *device,
+                                        DeviceScene *dscene,
+                                        Scene *scene,
+                                        Progress &progress)
+{
+  /* bvh build */
+  progress.set_status("Updating Scene BVH", "Building");
+
+  BVHParams bparams;
+  bparams.top_level = true;
+  bparams.bvh_layout = BVHParams::best_bvh_layout(
+      scene->params.bvh_layout, device->get_bvh_layout_mask(dscene->data.kernel_features));
+  bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
+  bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
+                                scene->params.use_bvh_unaligned_nodes;
+  bparams.num_motion_triangle_steps = scene->params.num_bvh_time_steps;
+  bparams.num_motion_curve_steps = scene->params.num_bvh_time_steps;
+  bparams.num_motion_point_steps = scene->params.num_bvh_time_steps;
+  bparams.bvh_type = scene->params.bvh_type;
+  bparams.curve_subdivisions = scene->params.curve_subdivisions();
+
+  VLOG_INFO << "Using " << bvh_layout_name(bparams.bvh_layout) << " layout.";
+
+  const bool can_refit = scene->bvh != nullptr &&
+                         (bparams.bvh_layout == BVHLayout::BVH_LAYOUT_OPTIX ||
+                          bparams.bvh_layout == BVHLayout::BVH_LAYOUT_METAL);
+
+  BVH *bvh = scene->bvh;
+  if (!scene->bvh) {
+    bvh = scene->bvh = BVH::create(bparams, scene->geometry, scene->objects, device);
+  }
+
+  device->build_bvh(bvh, progress, can_refit);
+
+  if (progress.get_cancel()) {
+    return;
+  }
+
+  const bool has_bvh2_layout = (bparams.bvh_layout == BVH_LAYOUT_BVH2);
+
+  PackedBVH pack;
+  if (has_bvh2_layout) {
+    pack = std::move(static_cast<BVH2 *>(bvh)->pack);
+  }
+  else {
+    pack.root_index = -1;
+  }
+
+  /* copy to device */
+  progress.set_status("Updating Scene BVH", "Copying BVH to device");
+
+  /* When using BVH2, we always have to copy/update the data as its layout is dependent on the
+   * BVH's leaf nodes which may be different when the objects or vertices move. */
+
+  if (pack.nodes.size()) {
+    dscene->bvh_nodes.steal_data(pack.nodes);
+    dscene->bvh_nodes.copy_to_device();
+  }
+  if (pack.leaf_nodes.size()) {
+    dscene->bvh_leaf_nodes.steal_data(pack.leaf_nodes);
+    dscene->bvh_leaf_nodes.copy_to_device();
+  }
+  if (pack.object_node.size()) {
+    dscene->object_node.steal_data(pack.object_node);
+    dscene->object_node.copy_to_device();
+  }
+  if (pack.prim_type.size()) {
+    dscene->prim_type.steal_data(pack.prim_type);
+    dscene->prim_type.copy_to_device();
+  }
+  if (pack.prim_visibility.size()) {
+    dscene->prim_visibility.steal_data(pack.prim_visibility);
+    dscene->prim_visibility.copy_to_device();
+  }
+  if (pack.prim_index.size()) {
+    dscene->prim_index.steal_data(pack.prim_index);
+    dscene->prim_index.copy_to_device();
+  }
+  if (pack.prim_object.size()) {
+    dscene->prim_object.steal_data(pack.prim_object);
+    dscene->prim_object.copy_to_device();
+  }
+  if (pack.prim_time.size()) {
+    dscene->prim_time.steal_data(pack.prim_time);
+    dscene->prim_time.copy_to_device();
+  }
+
+  dscene->data.bvh.root = pack.root_index;
+  dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
+  dscene->data.bvh.curve_subdivisions = scene->params.curve_subdivisions();
+  /* The scene handle is set in 'CPUDevice::const_copy_to' and 'OptiXDevice::const_copy_to' */
+  dscene->data.device_bvh = 0;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/scene/geometry_mesh.cpp
+++ b/intern/cycles/scene/geometry_mesh.cpp
@@ -0,0 +1,223 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "bvh/bvh.h"
+#include "bvh/bvh2.h"
+
+#include "device/device.h"
+
+#include "scene/attribute.h"
+#include "scene/camera.h"
+#include "scene/geometry.h"
+#include "scene/hair.h"
+#include "scene/light.h"
+#include "scene/mesh.h"
+#include "scene/object.h"
+#include "scene/osl.h"
+#include "scene/pointcloud.h"
+#include "scene/scene.h"
+#include "scene/shader.h"
+#include "scene/shader_nodes.h"
+#include "scene/stats.h"
+#include "scene/volume.h"
+
+#include "subd/patch_table.h"
+#include "subd/split.h"
+
+#ifdef WITH_OSL
+#  include "kernel/osl/globals.h"
+#endif
+
+#include "util/foreach.h"
+#include "util/log.h"
+#include "util/progress.h"
+#include "util/task.h"
+
+CCL_NAMESPACE_BEGIN
+
+void GeometryManager::device_update_mesh(Device *,
+                                         DeviceScene *dscene,
+                                         Scene *scene,
+                                         Progress &progress)
+{
+  /* Count. */
+  size_t vert_size = 0;
+  size_t tri_size = 0;
+
+  size_t curve_key_size = 0;
+  size_t curve_size = 0;
+  size_t curve_segment_size = 0;
+
+  size_t point_size = 0;
+
+  size_t patch_size = 0;
+
+  foreach (Geometry *geom, scene->geometry) {
+    if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+      Mesh *mesh = static_cast<Mesh *>(geom);
+
+      vert_size += mesh->verts.size();
+      tri_size += mesh->num_triangles();
+
+      if (mesh->get_num_subd_faces()) {
+        Mesh::SubdFace last = mesh->get_subd_face(mesh->get_num_subd_faces() - 1);
+        patch_size += (last.ptex_offset + last.num_ptex_faces()) * 8;
+
+        /* patch tables are stored in same array so include them in patch_size */
+        if (mesh->patch_table) {
+          mesh->patch_table_offset = patch_size;
+          patch_size += mesh->patch_table->total_size();
+        }
+      }
+    }
+    else if (geom->is_hair()) {
+      Hair *hair = static_cast<Hair *>(geom);
+
+      curve_key_size += hair->get_curve_keys().size();
+      curve_size += hair->num_curves();
+      curve_segment_size += hair->num_segments();
+    }
+    else if (geom->is_pointcloud()) {
+      PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+      point_size += pointcloud->num_points();
+    }
+  }
+
+  /* Fill in all the arrays. */
+  if (tri_size != 0) {
+    /* normals */
+    progress.set_status("Updating Mesh", "Computing normals");
+
+    packed_float3 *tri_verts = dscene->tri_verts.alloc(vert_size);
+    uint *tri_shader = dscene->tri_shader.alloc(tri_size);
+    packed_float3 *vnormal = dscene->tri_vnormal.alloc(vert_size);
+    packed_uint3 *tri_vindex = dscene->tri_vindex.alloc(tri_size);
+    uint *tri_patch = dscene->tri_patch.alloc(tri_size);
+    float2 *tri_patch_uv = dscene->tri_patch_uv.alloc(vert_size);
+
+    const bool copy_all_data = dscene->tri_shader.need_realloc() ||
+                               dscene->tri_vindex.need_realloc() ||
+                               dscene->tri_vnormal.need_realloc() ||
+                               dscene->tri_patch.need_realloc() ||
+                               dscene->tri_patch_uv.need_realloc();
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+
+        if (mesh->shader_is_modified() || mesh->smooth_is_modified() ||
+            mesh->triangles_is_modified() || copy_all_data) {
+          mesh->pack_shaders(scene, &tri_shader[mesh->prim_offset]);
+        }
+
+        if (mesh->verts_is_modified() || copy_all_data) {
+          mesh->pack_normals(&vnormal[mesh->vert_offset]);
+        }
+
+        if (mesh->verts_is_modified() || mesh->triangles_is_modified() ||
+            mesh->vert_patch_uv_is_modified() || copy_all_data) {
+          mesh->pack_verts(&tri_verts[mesh->vert_offset],
+                           &tri_vindex[mesh->prim_offset],
+                           &tri_patch[mesh->prim_offset],
+                           &tri_patch_uv[mesh->vert_offset]);
+        }
+
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    /* vertex coordinates */
+    progress.set_status("Updating Mesh", "Copying Mesh to device");
+
+    dscene->tri_verts.copy_to_device_if_modified();
+    dscene->tri_shader.copy_to_device_if_modified();
+    dscene->tri_vnormal.copy_to_device_if_modified();
+    dscene->tri_vindex.copy_to_device_if_modified();
+    dscene->tri_patch.copy_to_device_if_modified();
+    dscene->tri_patch_uv.copy_to_device_if_modified();
+  }
+
+  if (curve_segment_size != 0) {
+    progress.set_status("Updating Mesh", "Copying Curves to device");
+
+    float4 *curve_keys = dscene->curve_keys.alloc(curve_key_size);
+    KernelCurve *curves = dscene->curves.alloc(curve_size);
+    KernelCurveSegment *curve_segments = dscene->curve_segments.alloc(curve_segment_size);
+
+    const bool copy_all_data = dscene->curve_keys.need_realloc() ||
+                               dscene->curves.need_realloc() ||
+                               dscene->curve_segments.need_realloc();
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->is_hair()) {
+        Hair *hair = static_cast<Hair *>(geom);
+
+        bool curve_keys_co_modified = hair->curve_radius_is_modified() ||
+                                      hair->curve_keys_is_modified();
+        bool curve_data_modified = hair->curve_shader_is_modified() ||
+                                   hair->curve_first_key_is_modified();
+
+        if (!curve_keys_co_modified && !curve_data_modified && !copy_all_data) {
+          continue;
+        }
+
+        hair->pack_curves(scene,
+                          &curve_keys[hair->curve_key_offset],
+                          &curves[hair->prim_offset],
+                          &curve_segments[hair->curve_segment_offset]);
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    dscene->curve_keys.copy_to_device_if_modified();
+    dscene->curves.copy_to_device_if_modified();
+    dscene->curve_segments.copy_to_device_if_modified();
+  }
+
+  if (point_size != 0) {
+    progress.set_status("Updating Mesh", "Copying Point clouds to device");
+
+    float4 *points = dscene->points.alloc(point_size);
+    uint *points_shader = dscene->points_shader.alloc(point_size);
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->is_pointcloud()) {
+        PointCloud *pointcloud = static_cast<PointCloud *>(geom);
+        pointcloud->pack(
+            scene, &points[pointcloud->prim_offset], &points_shader[pointcloud->prim_offset]);
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    dscene->points.copy_to_device();
+    dscene->points_shader.copy_to_device();
+  }
+
+  if (patch_size != 0 && dscene->patches.need_realloc()) {
+    progress.set_status("Updating Mesh", "Copying Patches to device");
+
+    uint *patch_data = dscene->patches.alloc(patch_size);
+
+    foreach (Geometry *geom, scene->geometry) {
+      if (geom->is_mesh()) {
+        Mesh *mesh = static_cast<Mesh *>(geom);
+        mesh->pack_patches(&patch_data[mesh->patch_offset]);
+
+        if (mesh->patch_table) {
+          mesh->patch_table->copy_adjusting_offsets(&patch_data[mesh->patch_table_offset],
+                                                    mesh->patch_table_offset);
+        }
+
+        if (progress.get_cancel())
+          return;
+      }
+    }
+
+    dscene->patches.copy_to_device();
+  }
+}
+
+CCL_NAMESPACE_END
--- a/Show More
+++ b/Show More