GPU: Cleanup GPU_batch.h documentation and some of the API for consistency

Documented all functions, adding use cases and side effects. Also replaced the shortened argument names with more meaningful ones. Renamed `GPU_batch_instbuf_add_ex` and `GPU_batch_vertbuf_add_ex` to remove the `ex` suffix, as they are the main versions used (removed the few usages of the other versions). Renamed `GPU_batch_draw_instanced` to `GPU_batch_draw_instance_range` and made it consistent with `GPU_batch_draw_range`.
Cleanup: Move 6 sculpt-session-related files and header to C++
2023-02-11 12:52:58 -08:00 · 2023-02-11 12:52:55 -08:00 · 2023-02-11 12:50:45 -08:00 · 2023-02-11 12:50:45 -08:00 · 2023-02-11 12:50:45 -08:00 · 2023-02-11 12:50:45 -08:00
1016 changed files with 50482 additions and 34960 deletions
--- a/.gitea/default_merge_message/REBASE_TEMPLATE.md
+++ b/.gitea/default_merge_message/REBASE_TEMPLATE.md
@@ -0,0 +1,5 @@
+${CommitTitle}
+
+${CommitBody}
+
+Pull Request #${PullRequestIndex}
--- a/.gitea/default_merge_message/SQUASH_TEMPLATE.md
+++ b/.gitea/default_merge_message/SQUASH_TEMPLATE.md
@@ -0,0 +1,3 @@
+${PullRequestTitle}
+
+Pull Request #${PullRequestIndex}
--- a/.gitea/issue_template/bug.yaml
+++ b/.gitea/issue_template/bug.yaml
@@ -1,13 +1,15 @@
 name: Bug Report
 about: File a bug report
 labels:
-  - bug
+  - "type::Report"
+  - "status::Needs Triage"
+  - "priority::Normal"
 body:
  - type: markdown
    attributes:
      value: |
                ### Instructions
-                First time reporting? See [tips](https://wiki.blender.org/wiki/Process/Bug_Reports) and [walkthrough video](https://www.youtube.com/watch?v=JTD0OJq_rF4).
+                First time reporting? See [tips](https://wiki.blender.org/wiki/Process/Bug_Reports).

                * Use **Help > Report a Bug** in Blender to fill system information and exact Blender version.
                * Test [daily builds](https://builder.blender.org/) to verify if the issue is already fixed.
@@ -19,6 +21,7 @@ body:
    id: body
    attributes:
      label: "Description"
+      hide_label: true
      value: |
               **System Information**
               Operating system:
--- a/.gitea/issue_template/design.yaml
+++ b/.gitea/issue_template/design.yaml
@@ -1,9 +1,10 @@
 name: Design
 about: Create a design task (for developers only)
 labels:
-  - design
+  - "type::Design"
 body:
  - type: textarea
    id: body
    attributes:
      label: "Description"
+      hide_label: true
--- a/.gitea/issue_template/todo.yaml
+++ b/.gitea/issue_template/todo.yaml
@@ -1,9 +1,10 @@
 name: To Do
 about: Create a to do task (for developers only)
 labels:
-  - todo
+  - "type::To Do"
 body:
  - type: textarea
    id: body
    attributes:
      label: "Description"
+      hide_label: true
--- a/.gitea/pull_request_template.yaml
+++ b/.gitea/pull_request_template.yaml
@@ -14,7 +14,4 @@ body:
    id: body
    attributes:
      label: "Description"
-      value: |
-               Description of the problem that is addressed in the patch.
-
-               Description of the proposed solution and its implementation.
+      hide_label: true
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,5 +1,4 @@
-This repository is only used as a mirror of git.blender.org. Blender development happens on
-https://developer.blender.org.
+This repository is only used as a mirror. Blender development happens on projects.blender.org.

 To get started with contributing code, please see:
 https://wiki.blender.org/wiki/Process/Contributing_Code
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -15,8 +15,7 @@ staleLabel: stale
 # Comment to post when closing a stale Issue or Pull Request.
 closeComment: >
  This issue has been automatically closed, because this repository is only
-  used as a mirror of git.blender.org. Blender development happens on
-  developer.blender.org.
+  used as a mirror. Blender development happens on projects.blender.org.

  To get started contributing code, please read:
  https://wiki.blender.org/wiki/Process/Contributing_Code
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,20 +1,16 @@
 [submodule "release/scripts/addons"]
 	path = release/scripts/addons
 	url = ../blender-addons.git
-	branch = master
-	ignore = all
+	branch = main
 [submodule "release/scripts/addons_contrib"]
 	path = release/scripts/addons_contrib
 	url = ../blender-addons-contrib.git
-	branch = master
-	ignore = all
+	branch = main
 [submodule "release/datafiles/locale"]
 	path = release/datafiles/locale
 	url = ../blender-translations.git
-	branch = master
-	ignore = all
+	branch = main
 [submodule "source/tools"]
 	path = source/tools
 	url = ../blender-dev-tools.git
-	branch = master
-	ignore = all
+	branch = main
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -167,14 +167,26 @@ get_blender_version()
 option(WITH_BLENDER "Build blender (disable to build only the blender player)" ON)
 mark_as_advanced(WITH_BLENDER)

-if(APPLE)
-  # In future, can be used with `quicklookthumbnailing/qlthumbnailreply` to create file
-  # thumbnails for say Finder. Turn it off for now.
-  option(WITH_BLENDER_THUMBNAILER "Build \"blender-thumbnailer\" thumbnail extraction utility" OFF)
-elseif(WIN32)
-  option(WITH_BLENDER_THUMBNAILER "Build \"BlendThumb.dll\" helper for Windows explorer integration" ON)
+if(WIN32)
+  option(WITH_BLENDER_THUMBNAILER "\
+Build \"BlendThumb.dll\" helper for Windows explorer integration to support extracting \
+thumbnails from `.blend` files."
+    ON
+  )
 else()
-  option(WITH_BLENDER_THUMBNAILER "Build \"blender-thumbnailer\" thumbnail extraction utility" ON)
+  set(_option_default ON)
+  if(APPLE)
+    # In future, can be used with `quicklookthumbnailing/qlthumbnailreply`
+    # to create file thumbnails for say Finder.
+    # Turn it off for now, even though it can build on APPLE, it's not likely to be useful.
+    set(_option_default OFF)
+  endif()
+  option(WITH_BLENDER_THUMBNAILER "\
+Build stand-alone \"blender-thumbnailer\" command-line thumbnail extraction utility, \
+intended for use by file-managers to extract PNG images from `.blend` files."
+    ${_option_default}
+  )
+  unset(_option_default)
 endif()

 option(WITH_INTERNATIONAL "Enable I18N (International fonts and text)" ON)
@@ -214,14 +226,19 @@ option(WITH_BULLET        "Enable Bullet (Physics Engine)" ON)
 option(WITH_SYSTEM_BULLET "Use the systems bullet library (currently unsupported due to missing features in upstream!)" )
 mark_as_advanced(WITH_SYSTEM_BULLET)
 option(WITH_OPENCOLORIO   "Enable OpenColorIO color management" ON)
+
+set(_option_default ON)
 if(APPLE)
  # There's no OpenXR runtime in sight for macOS, neither is code well
  # tested there -> disable it by default.
-  option(WITH_XR_OPENXR   "Enable VR features through the OpenXR specification" OFF)
-  mark_as_advanced(WITH_XR_OPENXR)
-else()
-  option(WITH_XR_OPENXR   "Enable VR features through the OpenXR specification" ON)
+  set(_option_default OFF)
 endif()
+option(WITH_XR_OPENXR "Enable VR features through the OpenXR specification" ${_option_default})
+if(APPLE)
+  mark_as_advanced(WITH_XR_OPENXR)
+endif()
+unset(_option_default)
+
 option(WITH_GMP "Enable features depending on GMP (Exact Boolean)" ON)

 # Compositor
@@ -353,12 +370,13 @@ else()
  set(WITH_COREAUDIO OFF)
 endif()
 if(NOT WIN32)
+  set(_option_default ON)
  if(APPLE)
-    option(WITH_JACK          "Enable JACK Support (http://www.jackaudio.org)" OFF)
-  else()
-    option(WITH_JACK          "Enable JACK Support (http://www.jackaudio.org)" ON)
+    set(_option_default OFF)
  endif()
-  option(WITH_JACK_DYNLOAD  "Enable runtime dynamic JACK libraries loading" OFF)
+  option(WITH_JACK "Enable JACK Support (http://www.jackaudio.org)" ${_option_default})
+  unset(_option_default)
+  option(WITH_JACK_DYNLOAD "Enable runtime dynamic JACK libraries loading" OFF)
 else()
  set(WITH_JACK OFF)
 endif()
@@ -506,7 +524,7 @@ endif()
 if(NOT APPLE)
  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" ON)
  option(WITH_CYCLES_HIP_BINARIES      "Build Cycles AMD HIP binaries" OFF)
-  set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx906 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
+  set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
  mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
  mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
 endif()
@@ -1223,13 +1241,6 @@ if(WITH_OPENGL)
  add_definitions(-DWITH_OPENGL)
 endif()

-#-----------------------------------------------------------------------------
-# Configure Vulkan.
-
-if(WITH_VULKAN_BACKEND)
-  list(APPEND BLENDER_GL_LIBRARIES ${VULKAN_LIBRARIES})
-endif()
-
 # -----------------------------------------------------------------------------
 # Configure Metal

--- a/GNUmakefile
+++ b/GNUmakefile
@@ -71,6 +71,13 @@ Static Source Code Checking
   * check_mypy:            Checks all Python scripts using mypy,
                            see: source/tools/check_source/check_mypy_config.py scripts which are included.

+Documentation Checking
+
+   * check_wiki_file_structure:
+     Check the WIKI documentation for the source-tree's file structure
+     matches Blender's source-code.
+     See: https://wiki.blender.org/wiki/Source/File_Structure
+
 Spell Checkers
  This runs the spell checker from the developer tools repository.

@@ -481,6 +488,10 @@ check_smatch: .FORCE
 check_mypy: .FORCE
 	@$(PYTHON) "$(BLENDER_DIR)/source/tools/check_source/check_mypy.py"

+check_wiki_file_structure: .FORCE
+	@PYTHONIOENCODING=utf_8 $(PYTHON) \
+	    "$(BLENDER_DIR)/source/tools/check_wiki/check_wiki_file_structure.py"
+
 check_spelling_py: .FORCE
 	@cd "$(BUILD_DIR)" ; \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Development
 -----------

 - [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
- [Code Review & Bug Tracker](https://developer.blender.org)
+- [Code Review & Bug Tracker](https://projects.blender.org)
 - [Developer Forum](https://devtalk.blender.org)
 - [Developer Documentation](https://wiki.blender.org)

--- a/build_files/build_environment/cmake/dpcpp.cmake
+++ b/build_files/build_environment/cmake/dpcpp.cmake
@@ -2,7 +2,7 @@

 # LLVM does not switch over to cpp17 until llvm 16 and building earlier versions with
 # MSVC is leading to some crashes in ISPC. Switch back to their default on all platforms
-# for now. 
+# for now.
 string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " DPCPP_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")

 if(WIN32)
--- a/build_files/build_environment/cmake/igc.cmake
+++ b/build_files/build_environment/cmake/igc.cmake
@@ -40,7 +40,8 @@ ExternalProject_Add(external_igc_llvm
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
 )
 add_dependencies(
  external_igc_llvm
@@ -55,9 +56,6 @@ ExternalProject_Add(external_igc_spirv_translator
  CONFIGURE_COMMAND echo .
  BUILD_COMMAND echo .
  INSTALL_COMMAND echo .
-  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
 )
 add_dependencies(
  external_igc_spirv_translator
--- a/build_files/build_environment/cmake/llvm.cmake
+++ b/build_files/build_environment/cmake/llvm.cmake
@@ -42,7 +42,7 @@ endif()

 # LLVM does not switch over to cpp17 until llvm 16 and building earlier versions with
 # MSVC is leading to some crashes in ISPC. Switch back to their default on all platforms
-# for now. 
+# for now.
 string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " LLVM_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")

 # short project name due to long filename issues on windows
--- a/build_files/build_environment/cmake/python.cmake
+++ b/build_files/build_environment/cmake/python.cmake
@@ -88,6 +88,19 @@ else()
    export LDFLAGS=${PYTHON_LDFLAGS} &&
    export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)

+  # NOTE: untested on APPLE so far.
+  if(NOT APPLE)
+    set(PYTHON_CONFIGURE_EXTRA_ARGS
+      ${PYTHON_CONFIGURE_EXTRA_ARGS}
+      # Used on most release Linux builds (e.g. Fedora),
+      # increases build times noticeably with the benefit of a modest speedup at runtime.
+      --enable-optimizations
+      # While LTO is OK when building on the same system, it's incompatible across GCC versions,
+      # making it impractical for developers to build against, so keep it disabled.
+      # `--with-lto`
+    )
+  endif()
+
  ExternalProject_Add(external_python
    URL file://${PACKAGE_DIR}/${PYTHON_FILE}
    DOWNLOAD_DIR ${DOWNLOAD_DIR}
--- a/build_files/build_environment/cmake/ssl.cmake
+++ b/build_files/build_environment/cmake/ssl.cmake
@@ -10,9 +10,9 @@ if(WIN32)
    DOWNLOAD_DIR ${DOWNLOAD_DIR}
    URL_HASH ${SSL_HASH_TYPE}=${SSL_HASH}
    PREFIX ${BUILD_DIR}/ssl
-    CONFIGURE_COMMAND echo "." 
-    BUILD_COMMAND echo "." 
-    INSTALL_COMMAND echo "." 
+    CONFIGURE_COMMAND echo "."
+    BUILD_COMMAND echo "."
+    INSTALL_COMMAND echo "."
    INSTALL_DIR ${LIBDIR}/ssl
  )
 else()
@@ -46,4 +46,4 @@ else()
    INSTALL_COMMAND ${CONFIGURE_ENV} && cd ${BUILD_DIR}/ssl/src/external_ssl/ && make install
    INSTALL_DIR ${LIBDIR}/ssl
  )
-endif()
+endif()
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
 # compiler, the versions used are taken from the following location
 # https://github.com/intel/intel-graphics-compiler/releases

-set(IGC_VERSION 1.0.12149.1)
+set(IGC_VERSION 1.0.13064.7)
 set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
-set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
+set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
 set(IGC_HASH_TYPE SHA256)
 set(IGC_FILE igc-${IGC_VERSION}.tar.gz)

@@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
 #
 # WARNING WARNING WARNING

-set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
+set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
 set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
-set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
+set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
 set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
 set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)

-set(IGC_VCINTRINSICS_VERSION v0.5.0)
+set(IGC_VCINTRINSICS_VERSION v0.11.0)
 set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
-set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
+set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
 set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
 set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)

@@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
 set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
 set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)

-set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
+set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
 set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
-set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
+set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
 set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
 set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)

@@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
 ### Intel Graphics Compiler DEPS END ###
 ########################################

-set(GMMLIB_VERSION intel-gmmlib-22.1.8)
+set(GMMLIB_VERSION intel-gmmlib-22.3.0)
 set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
-set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
+set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
 set(GMMLIB_HASH_TYPE SHA256)
 set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)

-set(OCLOC_VERSION 22.38.24278)
+set(OCLOC_VERSION 22.49.25018.21)
 set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
-set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
+set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
 set(OCLOC_HASH_TYPE SHA256)
 set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

--- a/build_files/build_environment/cmake/xml2.cmake
+++ b/build_files/build_environment/cmake/xml2.cmake
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 if(WIN32)
-  set(XML2_EXTRA_ARGS 
+  set(XML2_EXTRA_ARGS
    -DLIBXML2_WITH_ZLIB=OFF
    -DLIBXML2_WITH_LZMA=OFF
    -DLIBXML2_WITH_PYTHON=OFF
--- a/build_files/build_environment/patches/igc_opencl_clang.diff
+++ b/build_files/build_environment/patches/igc_opencl_clang.diff
@@ -1,7 +1,7 @@
 diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
 --- external_igc_opencl_clang.orig/CMakeLists.txt	2022-03-16 05:51:10 -0600
 +++ external_igc_opencl_clang/CMakeLists.txt	2022-05-23 10:40:09 -0600
-@@ -126,22 +126,24 @@
+@@ -147,22 +147,24 @@
         )
     endif()
 
--- a/build_files/cmake/Modules/FindMoltenVK.cmake
+++ b/build_files/cmake/Modules/FindMoltenVK.cmake
@@ -24,7 +24,7 @@ SET(_moltenvk_SEARCH_DIRS
 # FIXME: These finder modules typically don't use LIBDIR,
 # this should be set by `./build_files/cmake/platform/` instead.
 IF(DEFINED LIBDIR)
-  SET(_moltenvk_SEARCH_DIRS ${_moltenvk_SEARCH_DIRS} ${LIBDIR}/vulkan/MoltenVK)
+  SET(_moltenvk_SEARCH_DIRS ${_moltenvk_SEARCH_DIRS} ${LIBDIR}/moltenvk)
 ENDIF()

 FIND_PATH(MOLTENVK_INCLUDE_DIR
--- a/build_files/cmake/Modules/FindShaderC.cmake
+++ b/build_files/cmake/Modules/FindShaderC.cmake
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2023 Blender Foundation.
+
+# - Find ShaderC libraries
+# Find the ShaderC includes and libraries
+# This module defines
+#  SHADERC_INCLUDE_DIRS, where to find ShaderC headers, Set when
+#                        SHADERC_INCLUDE_DIR is found.
+#  SHADERC_LIBRARIES, libraries to link against to use ShaderC.
+#  SHADERC_ROOT_DIR, The base directory to search for ShaderC.
+#                    This can also be an environment variable.
+#  SHADERC_FOUND, If false, do not try to use ShaderC.
+#
+
+# If SHADERC_ROOT_DIR was defined in the environment, use it.
+IF(NOT SHADERC_ROOT_DIR AND NOT $ENV{SHADERC_ROOT_DIR} STREQUAL "")
+  SET(SHADERC_ROOT_DIR $ENV{SHADERC_ROOT_DIR})
+ENDIF()
+
+SET(_shaderc_SEARCH_DIRS
+  ${SHADERC_ROOT_DIR}
+)
+
+# FIXME: These finder modules typically don't use LIBDIR,
+# this should be set by `./build_files/cmake/platform/` instead.
+IF(DEFINED LIBDIR)
+  SET(_shaderc_SEARCH_DIRS ${_shaderc_SEARCH_DIRS} ${LIBDIR}/shaderc)
+ENDIF()
+
+FIND_PATH(SHADERC_INCLUDE_DIR
+  NAMES
+    shaderc/shaderc.h
+  HINTS
+    ${_shaderc_SEARCH_DIRS}
+  PATH_SUFFIXES
+    include
+)
+
+FIND_LIBRARY(SHADERC_LIBRARY
+  NAMES
+    shaderc_combined
+  HINTS
+    ${_shaderc_SEARCH_DIRS}
+  PATH_SUFFIXES
+    lib
+)
+
+# handle the QUIETLY and REQUIRED arguments and set SHADERC_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ShaderC DEFAULT_MSG SHADERC_LIBRARY SHADERC_INCLUDE_DIR)
+
+IF(SHADERC_FOUND)
+  SET(SHADERC_LIBRARIES ${SHADERC_LIBRARY})
+  SET(SHADERC_INCLUDE_DIRS ${SHADERC_INCLUDE_DIR})
+ENDIF()
+
+MARK_AS_ADVANCED(
+  SHADERC_INCLUDE_DIR
+  SHADERC_LIBRARY
+)
+
+UNSET(_shaderc_SEARCH_DIRS)
--- a/build_files/cmake/Modules/FindVulkan.cmake
+++ b/build_files/cmake/Modules/FindVulkan.cmake
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2023 Blender Foundation.
+
+# - Find Vulkan libraries
+# Find the Vulkan includes and libraries
+# This module defines
+#  VULKAN_INCLUDE_DIRS, where to find Vulkan headers, Set when
+#                       VULKAN_INCLUDE_DIR is found.
+#  VULKAN_LIBRARIES, libraries to link against to use Vulkan.
+#  VULKAN_ROOT_DIR, The base directory to search for Vulkan.
+#                    This can also be an environment variable.
+#  VULKAN_FOUND, If false, do not try to use Vulkan.
+#
+
+# If VULKAN_ROOT_DIR was defined in the environment, use it.
+IF(NOT VULKAN_ROOT_DIR AND NOT $ENV{VULKAN_ROOT_DIR} STREQUAL "")
+  SET(VULKAN_ROOT_DIR $ENV{VULKAN_ROOT_DIR})
+ENDIF()
+
+SET(_vulkan_SEARCH_DIRS
+  ${VULKAN_ROOT_DIR}
+)
+
+# FIXME: These finder modules typically don't use LIBDIR,
+# this should be set by `./build_files/cmake/platform/` instead.
+IF(DEFINED LIBDIR)
+  SET(_vulkan_SEARCH_DIRS ${_vulkan_SEARCH_DIRS} ${LIBDIR}/vulkan)
+ENDIF()
+
+FIND_PATH(VULKAN_INCLUDE_DIR
+  NAMES
+    vulkan/vulkan.h
+  HINTS
+    ${_vulkan_SEARCH_DIRS}
+  PATH_SUFFIXES
+    include
+)
+
+FIND_LIBRARY(VULKAN_LIBRARY
+  NAMES
+    vulkan
+  HINTS
+    ${_vulkan_SEARCH_DIRS}
+  PATH_SUFFIXES
+    lib
+)
+
+# handle the QUIETLY and REQUIRED arguments and set VULKAN_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(Vulkan DEFAULT_MSG VULKAN_LIBRARY VULKAN_INCLUDE_DIR)
+
+IF(VULKAN_FOUND)
+  SET(VULKAN_LIBRARIES ${VULKAN_LIBRARY})
+  SET(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR})
+ENDIF()
+
+MARK_AS_ADVANCED(
+  VULKAN_INCLUDE_DIR
+  VULKAN_LIBRARY
+)
+
+UNSET(_vulkan_SEARCH_DIRS)
--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)

  if(MY_WC_BRANCH STREQUAL "HEAD")
    # Detached HEAD, check whether commit hash is reachable
-    # in the master branch
+    # in the main branch
    execute_process(COMMAND git rev-parse --short=12 HEAD
                    WORKING_DIRECTORY ${SOURCE_DIR}
                    OUTPUT_VARIABLE MY_WC_HASH
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

-    execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
+    execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
                    WORKING_DIRECTORY ${SOURCE_DIR}
                    OUTPUT_VARIABLE _git_contains_check
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(NOT _git_contains_check STREQUAL "")
-      set(MY_WC_BRANCH "master")
+      set(MY_WC_BRANCH "main")
    else()
      execute_process(COMMAND git show-ref --tags -d
                      WORKING_DIRECTORY ${SOURCE_DIR}
@@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
                      OUTPUT_STRIP_TRAILING_WHITESPACE)

      if(_git_tag_hashes MATCHES "${_git_head_hash}")
-        set(MY_WC_BRANCH "master")
+        set(MY_WC_BRANCH "main")
      else()
        execute_process(COMMAND git branch --contains ${MY_WC_HASH}
                        WORKING_DIRECTORY ${SOURCE_DIR}
--- a/build_files/cmake/example_scripts/cmake_linux_install.sh
+++ b/build_files/cmake/example_scripts/cmake_linux_install.sh
@@ -11,11 +11,11 @@
 mkdir ~/blender-git
 cd ~/blender-git

-git clone http://git.blender.org/blender.git
+git clone https://projects.blender.org/blender/blender.git
 cd blender
 git submodule update --init --recursive
-git submodule foreach git checkout master
-git submodule foreach git pull --rebase origin master
+git submodule foreach git checkout main
+git submodule foreach git pull --rebase origin main

 # create build dir
 mkdir ~/blender-git/build-cmake
@@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
 echo ""
 echo "* Useful Commands *"
 echo "   Run Blender: ~/blender-git/blender/blender.bin"
-echo "   Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
+echo "   Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
 echo "   Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
 echo "   Build Blender: cd ~/blender-git/build-cmake ; make"
 echo ""
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -97,20 +97,8 @@ add_bundled_libraries(materialx/lib)

 if(WITH_VULKAN_BACKEND)
  find_package(MoltenVK REQUIRED)
-
-  if(EXISTS ${LIBDIR}/vulkan)
-    set(VULKAN_FOUND On)
-    set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan/macOS)
-    set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
-    set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/libvulkan.1.dylib)
-    set(SHADERC_LIBRARY ${VULKAN_ROOT_DIR}/lib/libshaderc_combined.a)
-
-    set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR} ${MOLTENVK_INCLUDE_DIRS})
-    set(VULKAN_LIBRARIES ${VULKAN_LIBRARY} ${SHADERC_LIBRARY} ${MOLTENVK_LIBRARIES})
-  else()
-    message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
-    set(WITH_VULKAN_BACKEND OFF)
-  endif()
+  find_package(ShaderC REQUIRED)
+  find_package(Vulkan REQUIRED)
 endif()

 if(WITH_OPENSUBDIV)
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -111,6 +111,7 @@ find_package_wrapper(Epoxy REQUIRED)

 if(WITH_VULKAN_BACKEND)
  find_package_wrapper(Vulkan REQUIRED)
+  find_package_wrapper(ShaderC REQUIRED)
 endif()

 function(check_freetype_for_brotli)
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -5,16 +5,16 @@
 update-code:
    git:
        submodules:
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: release/scripts/addons
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: release/scripts/addons_contrib
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: release/datafiles/locale
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: source/tools
    svn:
@@ -63,7 +63,7 @@ buildbot:
    optix:
        version: '7.3.0'
    ocloc:
-        version: '101.3430'
+        version: '101.4032'
    cmake:
        default:
            version: any
--- a/build_files/utils/make_bpy_wheel.py
+++ b/build_files/utils/make_bpy_wheel.py
@@ -24,7 +24,7 @@ import os
 import re
 import platform
 import string
-import setuptools  # type: ignore
+import setuptools
 import sys

 from typing import (
@@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
 ## Source Code

 * [Releases](https://download.blender.org/source/)
-* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
+* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)

 ## Credits

@@ -208,7 +208,7 @@ def main() -> None:
        return paths

    # Ensure this wheel is marked platform specific.
-    class BinaryDistribution(setuptools.dist.Distribution):  # type: ignore
+    class BinaryDistribution(setuptools.dist.Distribution):
        def has_ext_modules(self) -> bool:
            return True

--- a/build_files/utils/make_test.py
+++ b/build_files/utils/make_test.py
@@ -13,10 +13,10 @@ import sys
 import make_utils
 from make_utils import call

-# Parse arguments
+# Parse arguments.


-def parse_arguments():
+def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument("--ctest-command", default="ctest")
    parser.add_argument("--cmake-command", default="cmake")
--- a/build_files/utils/make_update.py
+++ b/build_files/utils/make_update.py
@@ -170,7 +170,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
        return "rebase or merge in progress, complete it first"

    # Abort if uncommitted changes.
-    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
+    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
    if len(changes) != 0:
        return "you have unstaged changes"

@@ -202,8 +202,8 @@ def submodules_update(
        sys.exit(1)

    # Update submodules to appropriate given branch,
-    # falling back to master if none is given and/or found in a sub-repository.
-    branch_fallback = "master"
+    # falling back to main if none is given and/or found in a sub-repository.
+    branch_fallback = "main"
    if not branch:
        branch = branch_fallback

--- a/build_files/windows/check_submodules.cmd
+++ b/build_files/windows/check_submodules.cmd
@@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
 	if not "%GIT%" == "" (
 		"%GIT%" submodule update --init --recursive --progress
 		if errorlevel 1 goto FAIL
-		"%GIT%" submodule foreach git checkout master
+		"%GIT%" submodule foreach git checkout main
 		if errorlevel 1 goto FAIL
-		"%GIT%" submodule foreach git pull --rebase origin master
+		"%GIT%" submodule foreach git pull --rebase origin main
 		if errorlevel 1 goto FAIL
 		goto EOF
 	) else (
--- a/doc/python_api/examples/blf.py
+++ b/doc/python_api/examples/blf.py
@@ -37,7 +37,7 @@ def draw_callback_px(self, context):
    # BLF drawing routine
    font_id = font_info["font_id"]
    blf.position(font_id, 2, 80, 0)
-    blf.size(font_id, 50, 72)
+    blf.size(font_id, 50)
    blf.draw(font_id, "Hello World")


--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):

    # operators
    def write_ops():
-        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
-        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
-        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
+        API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/release/scripts"
+        API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
+        API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"

        op_modules = {}
        op = None
--- a/doc/python_api/static/js/version_switch.js
+++ b/doc/python_api/static/js/version_switch.js
@@ -156,7 +156,7 @@ var Popover = function() {
    },
    getNamed : function(v) {
      $.each(all_versions, function(ix, title) {
-        if (ix === "master" || ix === "latest") {
+        if (ix === "master" || ix === "main" || ix === "latest") {
          var m = title.match(/\d\.\d[\w\d\.]*/)[0];
          if (parseFloat(m) == v) {
            v = ix;
--- a/extern/hipew/README.blender
+++ b/extern/hipew/README.blender
@@ -1,5 +1,5 @@
 Project: Blender
-URL: https://git.blender.org/blender.git
+URL: https://projects.blender.org/blender/blender.git
 License: Apache 2.0
 Upstream version: N/A
 Local modifications: None
--- a/extern/vulkan_memory_allocator/CMakeLists.txt
+++ b/extern/vulkan_memory_allocator/CMakeLists.txt
@@ -7,6 +7,7 @@ set(INC

 set(INC_SYS
  ${VULKAN_INCLUDE_DIRS}
+  ${MOLTENVK_INCLUDE_DIRS}
 )

 set(SRC
--- a/extern/vulkan_memory_allocator/patches/remove_compilation_warning.diff
+++ b/extern/vulkan_memory_allocator/patches/remove_compilation_warning.diff
@@ -0,0 +1,15 @@
+diff --git a/extern/vulkan_memory_allocator/vk_mem_alloc.h b/extern/vulkan_memory_allocator/vk_mem_alloc.h
+index 60f572038c0..63a9994ba46 100644
+--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
+@@ -13371,8 +13371,8 @@ bool VmaDefragmentationContext_T::IncrementCounters(VkDeviceSize bytes)
+     // Early return when max found
+     if (++m_PassStats.allocationsMoved >= m_MaxPassAllocations || m_PassStats.bytesMoved >= m_MaxPassBytes)
+     {
+-        VMA_ASSERT(m_PassStats.allocationsMoved == m_MaxPassAllocations ||
+-            m_PassStats.bytesMoved == m_MaxPassBytes && "Exceeded maximal pass threshold!");
++        VMA_ASSERT((m_PassStats.allocationsMoved == m_MaxPassAllocations ||
++            m_PassStats.bytesMoved == m_MaxPassBytes) && "Exceeded maximal pass threshold!");
+         return true;
+     }
+     return false;
--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -12,6 +12,7 @@ from bpy.props import (
    PointerProperty,
    StringProperty,
 )
+from bpy.app.translations import pgettext_iface as iface_

 from math import pi

@@ -1664,30 +1665,48 @@ class CyclesPreferences(bpy.types.AddonPreferences):
            col.label(text="No compatible GPUs found for Cycles", icon='INFO')

            if device_type == 'CUDA':
-                col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
+                compute_capability = "3.0"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
            elif device_type == 'OPTIX':
-                col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
-                col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
+                compute_capability = "5.0"
+                driver_version = "470"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
+                col.label(text="and NVIDIA driver version %s or newer" % driver_version,
+                          icon='BLANK1', translate=False)
            elif device_type == 'HIP':
                import sys
                if sys.platform[:3] == "win":
-                    col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
+                    driver_version = "21.Q4"
+                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
+                    col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                elif sys.platform.startswith("linux"):
-                    col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
+                    driver_version = "22.10"
+                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
+                    col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
+                              translate=False)
            elif device_type == 'ONEAPI':
                import sys
                if sys.platform.startswith("win"):
+                    driver_version = "101.4032"
                    col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
-                    col.label(text="and Windows driver version 101.3430 or newer", icon='BLANK1')
+                    col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                elif sys.platform.startswith("linux"):
+                    driver_version = "1.3.24931"
                    col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
-                    col.label(text="  - intel-level-zero-gpu version 1.3.23904 or newer", icon='BLANK1')
+                    col.label(text=iface_("  - intel-level-zero-gpu version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                    col.label(text="  - oneAPI Level-Zero Loader", icon='BLANK1')
            elif device_type == 'METAL':
-                col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
-                col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
+                silicon_mac_version = "12.2"
+                amd_mac_version = "12.3"
+                col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
+                          icon='BLANK1', translate=False)
+                col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
+                          translate=False)
            return

        for device in devices:
@@ -1723,12 +1742,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):

        if compute_device_type == 'METAL':
            import platform
-            # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
-            # Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
-            if platform.machine() == 'arm64':
+            import re
+            is_navi_2 = False
+            for device in devices:
+                if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
+                    is_navi_2 = True
+                    break
+
+            # MetalRT only works on Apple Silicon and Navi2.
+            is_arm64 = platform.machine() == 'arm64'
+            if is_arm64 or is_navi_2:
                col = layout.column()
                col.use_property_split = True
-                col.prop(self, "kernel_optimization_level")
+                # Kernel specialization is only supported on Apple Silicon
+                if is_arm64:
+                    col.prop(self, "kernel_optimization_level")
                col.prop(self, "use_metalrt")

    def draw(self, context):
--- a/intern/cycles/blender/light.cpp
+++ b/intern/cycles/blender/light.cpp
@@ -48,6 +48,8 @@ void BlenderSync::sync_light(BL::Object &b_parent,
    case BL::Light::type_SPOT: {
      BL::SpotLight b_spot_light(b_light);
      light->set_size(b_spot_light.shadow_soft_size());
+      light->set_axisu(transform_get_column(&tfm, 0));
+      light->set_axisv(transform_get_column(&tfm, 1));
      light->set_light_type(LIGHT_SPOT);
      light->set_spot_angle(b_spot_light.spot_size());
      light->set_spot_smooth(b_spot_light.spot_blend());
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
 }

 CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
+  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
+
  first_error = true;

  cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

  need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
  pitch_alignment = 0;

  /* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
-  cuda_assert(
-      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  int value;
+  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
 #  endif
 }

-void CUDADevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void CUDADevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
-      device_memory &mem = *pair.first;
-      CUDAMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple CUDA devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
 {
  CUDAContextScope scope(this);

-  CUdeviceptr device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
  cuMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    cuMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = cuMemAlloc(&device_pointer, size);
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = CUDA_SUCCESS;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = cuMemHostAlloc(
-          &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
-      assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
-             (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != CUDA_SUCCESS) {
-    if (mem.type == MEM_DEVICE_ONLY) {
-      status = " failed, out of device memory";
-      set_error("System is out of GPU memory");
-    }
-    else {
-      status = " failed, out of device and host memory";
-      set_error("System is out of GPU and shared host memory");
-    }
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  CUDAMem *cmem = &cuda_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * CUDA memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }

-void CUDADevice::generic_copy_to(device_memory &mem)
+bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  CUDAContextScope scope(this);

-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const CUDAContextScope scope(this);
-    cuda_assert(
-        cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
+  return mem_alloc_result == CUDA_SUCCESS;
 }

-void CUDADevice::generic_free(device_memory &mem)
+void CUDADevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+  CUDAContextScope scope(this);

-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
+}

-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          cuMemFreeHost(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      cuda_assert(cuMemFree(mem.device_pointer));
-    }
+bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  CUDAContextScope scope(this);

-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  CUresult mem_alloc_result = cuMemHostAlloc(
+      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+  return mem_alloc_result == CUDA_SUCCESS;
+}

-    cuda_mem_map.erase(cuda_mem_map.find(&mem));
-  }
+void CUDADevice::free_host(void *shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuMemFreeHost(shared_pointer);
+}
+
+bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
+  return true;
+}
+
+void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const CUDAContextScope scope(this);
+
+  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
 }

 void CUDADevice::mem_alloc(device_memory &mem)
@@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const CUDAContextScope scope(this);
    cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
  }
@@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
      return;
  }

-  CUDAMem *cmem = NULL;
+  Mem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
    mem.device_size = size;
    stats.mem_alloc(size);

-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
@@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
 {
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
@@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
-      cuArrayDestroy(cmem.array);
+      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class CUDADevice : public Device {
+class CUDADevice : public GPUDevice {

  friend class CUDAContextScope;

@@ -29,36 +29,11 @@ class CUDADevice : public Device {
  CUdevice cuDevice;
  CUcontext cuContext;
  CUmodule cuModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
  int pitch_alignment;
  int cuDevId;
  int cuDevArchitecture;
  bool first_error;

-  struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    CUtexObject texobject;
-    CUarray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, CUDAMem> CUDAMemMap;
-  CUDAMemMap cuda_mem_map;
-  thread_mutex cuda_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
  CUDADeviceKernels kernels;

  static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {

  void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

  void mem_alloc(device_memory &mem) override;

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
  return nullptr;
 }

+GPUDevice::~GPUDevice() noexcept(false)
+{
+}
+
+bool GPUDevice::load_texture_info()
+{
+  if (need_texture_info) {
+    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+    need_texture_info = false;
+    texture_info.copy_to_device();
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+
+void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
+                                 size_t preferred_working_headroom)
+{
+  /* Limit amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system
+   * memory free, whichever is smaller. */
+  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  size_t system_ram = system_physical_ram();
+
+  if (system_ram > 0) {
+    if (system_ram / 2 > default_limit) {
+      map_host_limit = system_ram - default_limit;
+    }
+    else {
+      map_host_limit = system_ram / 2;
+    }
+  }
+  else {
+    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+
+  /* Amount of device memory to keep free after texture memory and working
+   * memory allocations respectively. A preferred value of 0 selects the
+   * default. The default working memory headroom is lower than the texture
+   * one, so that some space is left after all texture memory allocations. */
+  device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
+                                                             32 * 1024 * 1024LL;  // 32MB
+  device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
+                                                             128 * 1024 * 1024LL;  // 128MB
+
+  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    thread_scoped_lock lock(device_mem_map_mutex);
+    foreach (MemMap::value_type &pair, device_mem_map) {
+      device_memory &mem = *pair.first;
+      Mem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
+      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+                        (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* When `for_texture` is set, only image textures are candidates for moving. */
+      if (for_texture && !is_image) {
+        continue;
+      }
+
+      /* Try to move largest allocation, prefer moving images. */
+      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+        max_is_image = is_image;
+        max_size = mem.device_size;
+        max_mem = &mem;
+      }
+    }
+    lock.unlock();
+
+    /* Move to host memory. This part is mutex protected since
+     * multiple backend devices could be moving the memory. The
+     * first one will do it, and the rest will adopt the pointer. */
+    if (max_mem) {
+      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
+
+      static thread_mutex move_mutex;
+      thread_scoped_lock lock(move_mutex);
+
+      any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well). */
+      max_mem->device_copy_to();
+      size = (max_size >= size) ? 0 : size - max_size;
+
+      any_device_moving_textures_to_host = false;
+    }
+    else {
+      break;
+    }
+  }
+
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
+  /* Update texture info array with new pointers. */
+  load_texture_info();
+}
+
+/* Allocate `mem` in device memory, falling back to mapped host memory when possible.
+ * Returns the allocation map entry, or NULL if neither allocation succeeded. */
+GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+  void *device_pointer = 0;
+  size_t size = mem.memory_size() + pitch_padding;
+
+  bool mem_alloc_result = false;
+  const char *status = "";
+
+  /* First try allocating in device memory, respecting headroom. We make
+   * an exception for texture info. It is small and frequently accessed,
+   * so treat it as working memory.
+   *
+   * If there is not enough room for working memory, we will try to move
+   * textures to host memory, assuming the performance impact would have
+   * been worse for working memory. */
+  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+  bool is_image = is_texture && (mem.data_height > 1);
+
+  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+  size_t total = 0, free = 0;
+  get_device_memory_info(total, free);
+
+  /* Move textures to host memory if needed. */
+  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+    move_textures_to_host(size + headroom - free, is_texture);
+    get_device_memory_info(total, free);
+  }
+
+  /* Allocate in device memory. */
+  if (!move_texture_to_host && (size + headroom) < free) {
+    mem_alloc_result = alloc_device(device_pointer, size);
+    if (mem_alloc_result) {
+      device_mem_in_use += size;
+      status = " in device memory";
+    }
+  }
+
+  /* Fall back to mapped host memory if needed and possible. */
+  void *shared_pointer = 0;
+
+  if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
+    if (mem.shared_pointer) {
+      /* Another device already allocated host memory. */
+      mem_alloc_result = true;
+      shared_pointer = mem.shared_pointer;
+    }
+    else if (map_host_used + size < map_host_limit) {
+      /* Allocate host memory ourselves. */
+      mem_alloc_result = alloc_host(shared_pointer, size);
+      assert((mem_alloc_result && shared_pointer != 0) ||
+             (!mem_alloc_result && shared_pointer == 0));
+    }
+
+    if (mem_alloc_result) {
+      /* Call outside of assert(), whose argument is not evaluated when NDEBUG is defined. */
+      const bool transformed = transform_host_pointer(device_pointer, shared_pointer);
+      assert(transformed);
+      (void)transformed;
+      map_host_used += size;
+      status = " in host memory";
+    }
+  }
+
+  if (!mem_alloc_result) {
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
+  }
+
+  if (mem.name) {
+    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")" << status;
+  }
+
+  mem.device_pointer = (device_ptr)device_pointer;
+  mem.device_size = size;
+  stats.mem_alloc(size);
+
+  if (!mem.device_pointer) {
+    return NULL;
+  }
+
+  /* Insert into map of allocations. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  Mem *cmem = &device_mem_map[&mem];
+  if (shared_pointer != 0) {
+    /* Replace host pointer with our host allocation. Only works if
+     * memory layout is the same and has no pitch padding. Also
+     * does not work if we move textures to host during a render,
+     * since other devices might be using the memory. */
+    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+        mem.host_pointer != shared_pointer) {
+      memcpy(shared_pointer, mem.host_pointer, size);
+      /* A Call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device */
+      mem.host_free();
+      mem.host_pointer = shared_pointer;
+    }
+    mem.shared_pointer = shared_pointer;
+    mem.shared_counter++;
+    cmem->use_mapped_host = true;
+  }
+  else {
+    cmem->use_mapped_host = false;
+  }
+
+  return cmem;
+}
+
+void GPUDevice::generic_free(device_memory &mem)
+{
+  if (mem.device_pointer) {
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];
+
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free a mapped host memory. */
+
+    if (cmem.use_mapped_host) {
+      assert(mem.shared_pointer);
+      if (mem.shared_pointer) {
+        assert(mem.shared_counter > 0);
+        if (--mem.shared_counter == 0) {
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          free_host(mem.shared_pointer);
+          mem.shared_pointer = 0;
+        }
+      }
+      map_host_used -= mem.device_size;
+    }
+    else {
+      /* Free device memory. */
+      free_device((void *)mem.device_pointer);
+      device_mem_in_use -= mem.device_size;
+    }
+
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+}
+
+void GPUDevice::generic_copy_to(device_memory &mem)
+{
+  if (!mem.host_pointer || !mem.device_pointer) {
+    return;
+  }
+
+  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+   * backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
+   * copy data from mem.host_pointer. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+    copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
+  }
+}
+
 /* DeviceInfo */

 CCL_NAMESPACE_END
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -309,6 +309,93 @@ class Device {
  static uint devices_initialized_mask;
 };

+/* Device, which is GPU, with some common functionality for GPU backends */
+class GPUDevice : public Device {
+ protected:
+  GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+      : Device(info_, stats_, profiler_),
+        texture_info(this, "texture_info", MEM_GLOBAL),
+        need_texture_info(false),
+        can_map_host(false),
+        map_host_used(0),
+        map_host_limit(0),
+        device_texture_headroom(0),
+        device_working_headroom(0),
+        device_mem_map(),
+        device_mem_map_mutex(),
+        move_texture_to_host(false),
+        device_mem_in_use(0)
+  {
+  }
+
+ public:
+  virtual ~GPUDevice() noexcept(false);
+
+  /* For GPUs that can use bindless textures in some way or another. */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+  /* Returns true if the texture info was copied to the device (meaning, some more
+   * re-initialization might be needed). */
+  virtual bool load_texture_info();
+
+ protected:
+  /* Memory allocation, only accessed through device_memory. */
+  friend class device_memory;
+
+  bool can_map_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  typedef unsigned long long texMemObject;
+  typedef unsigned long long arrayMemObject;
+  struct Mem {
+    Mem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    texMemObject texobject;
+    arrayMemObject array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, Mem> MemMap;
+  MemMap device_mem_map;
+  thread_mutex device_mem_map_mutex;
+  bool move_texture_to_host;
+  /* Simple counter which will try to track amount of used device memory */
+  size_t device_mem_in_use;
+
+  virtual void init_host_memory(size_t preferred_texture_headroom = 0,
+                                size_t preferred_working_headroom = 0);
+  virtual void move_textures_to_host(size_t size, bool for_texture);
+
+  /* Allocation, deallocation and copy functions, with corresponding
+   * support of device/host allocations. */
+  virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+  virtual void generic_free(device_memory &mem);
+  virtual void generic_copy_to(device_memory &mem);
+
+  /* total - amount of device memory, free - amount of available device memory */
+  virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
+
+  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
+
+  virtual void free_device(void *device_pointer) = 0;
+
+  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
+
+  virtual void free_host(void *shared_pointer) = 0;
+
+  /* This function should return the device pointer corresponding to the shared pointer,
+   * which is a host buffer allocated in `alloc_host`. The function should return `true` if
+   * such address transformation is possible and `false` otherwise. */
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
+
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
+};
+
 CCL_NAMESPACE_END

 #endif /* __DEVICE_H__ */
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
 }

 HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
+  static_assert(sizeof(arrayMemObject) == sizeof(hArray));
+
  first_error = true;

  hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

  need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
  pitch_alignment = 0;

  /* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  /* hipDeviceMapHost for mapping host memory when out of device memory.
   * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
-  hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  int value;
+  hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  can_map_host = value != 0;

  hip_assert(
      hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
 #  endif
 }

-void HIPDevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void HIPDevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    foreach (HIPMemMap::value_type &pair, hip_mem_map) {
-      device_memory &mem = *pair.first;
-      HIPMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple HIP devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
 {
  HIPContextScope scope(this);

-  hipDeviceptr_t device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  hipError_t mem_alloc_result = hipErrorOutOfMemory;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
  hipMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    hipMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = hipMalloc(&device_pointer, size);
-    if (mem_alloc_result == hipSuccess) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != hipSuccess && can_map_host) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = hipSuccess;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = hipHostMalloc(
-          &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
-
-      assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
-             (mem_alloc_result != hipSuccess && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == hipSuccess) {
-      hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != hipSuccess) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  HIPMem *cmem = &hip_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * HIP memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }

-void HIPDevice::generic_copy_to(device_memory &mem)
+bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  HIPContextScope scope(this);

-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const HIPContextScope scope(this);
-    hip_assert(
-        hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
+  return mem_alloc_result == hipSuccess;
 }

-void HIPDevice::generic_free(device_memory &mem)
+void HIPDevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+  HIPContextScope scope(this);

-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  hip_assert(hipFree((hipDeviceptr_t)device_pointer));
+}

-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          hipHostFree(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      hip_assert(hipFree(mem.device_pointer));
-    }
+bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  HIPContextScope scope(this);

-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  hipError_t mem_alloc_result = hipHostMalloc(
+      &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);

-    hip_mem_map.erase(hip_mem_map.find(&mem));
-  }
+  return mem_alloc_result == hipSuccess;
+}
+
+void HIPDevice::free_host(void *shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hipHostFree(shared_pointer);
+}
+
+bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
+  return true;
+}
+
+void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const HIPContextScope scope(this);
+
+  hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
 }

 void HIPDevice::mem_alloc(device_memory &mem)
@@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const HIPContextScope scope(this);
    hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
  }
@@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
      return;
  }

-  HIPMem *cmem = NULL;
+  Mem *cmem = NULL;
  hArray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (hArray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
    mem.device_size = size;
    stats.mem_alloc(size);

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
    texDesc.filterMode = filter_mode;
    texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];

    hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
 {
  if (mem.device_pointer) {
    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
@@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
-      hipArrayDestroy(cmem.array);
+      hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class HIPDevice : public Device {
+class HIPDevice : public GPUDevice {

  friend class HIPContextScope;

@@ -26,36 +26,11 @@ class HIPDevice : public Device {
  hipDevice_t hipDevice;
  hipCtx_t hipContext;
  hipModule_t hipModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
  int pitch_alignment;
  int hipDevId;
  int hipDevArchitecture;
  bool first_error;

-  struct HIPMem {
-    HIPMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    hipTextureObject_t texobject;
-    hArray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, HIPMem> HIPMemMap;
-  HIPMemMap hip_mem_map;
-  thread_mutex hip_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
  HIPDeviceKernels kernels;

  static bool have_precompiled_kernels();
@@ -81,17 +56,13 @@ class HIPDevice : public Device {
  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

  void mem_alloc(device_memory &mem) override;

--- a/intern/cycles/device/hip/util.h
+++ b/intern/cycles/device/hip/util.h
@@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

-  return (major >= 9);
+  return (major >= 10);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
      return "integrator_terminated_paths_array";
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
      return "integrator_sorted_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
+      return "integrator_sort_bucket_pass";
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
+      return "integrator_sort_write_pass";
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      return "integrator_compact_paths_array";
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -247,6 +247,8 @@ class device_memory {
  bool is_resident(Device *sub_device) const;

 protected:
+  friend class Device;
+  friend class GPUDevice;
  friend class CUDADevice;
  friend class OptiXDevice;
  friend class HIPDevice;
--- a/intern/cycles/device/metal/bvh.h
+++ b/intern/cycles/device/metal/bvh.h
@@ -21,6 +21,7 @@ class BVHMetal : public BVH {

  API_AVAILABLE(macos(11.0))
  vector<id<MTLAccelerationStructure>> blas_array;
+  vector<uint32_t> blas_lookup;

  bool motion_blur = false;

--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,

    uint32_t instance_index = 0;
    uint32_t motion_transform_index = 0;
+
+    // allocate lookup buffer for worst-case scenario
+    uint64_t count = objects.size();
+    blas_lookup.resize(count);
+
    for (Object *ob : objects) {
      /* Skip non-traceable objects */
      if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
      /* Set user instance ID to object index */
      int object_index = ob->get_device_index();
      uint32_t user_id = uint32_t(object_index);
+      int currIndex = instance_index++;
+      assert(user_id < blas_lookup.size());
+      blas_lookup[user_id] = accel_struct_index;

      /* Bake into the appropriate descriptor */
      if (motion_blur) {
        MTLAccelerationStructureMotionInstanceDescriptor *instances =
            (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
-        MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
+        MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];

        desc.accelerationStructureIndex = accel_struct_index;
        desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
      else {
        MTLAccelerationStructureUserIDInstanceDescriptor *instances =
            (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
-        MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
+        MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];

        desc.accelerationStructureIndex = accel_struct_index;
        desc.userID = user_id;
--- a/intern/cycles/device/metal/device.mm
+++ b/intern/cycles/device/metal/device.mm
@@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
    info.denoisers = DENOISER_NONE;
    info.id = id;

+    if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
+      info.has_light_tree = false;
+    }
+
    devices.push_back(info);
    device_index++;
  }
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -74,6 +74,11 @@ class MetalDevice : public Device {
  id<MTLBuffer> texture_bindings_3d = nil;
  std::vector<id<MTLTexture>> texture_slot_map;

+  /* BLAS encoding & lookup */
+  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+  id<MTLBuffer> blas_buffer = nil;
+  id<MTLBuffer> blas_lookup_buffer = nil;
+
  bool use_metalrt = false;
  MetalPipelineType kernel_specialization_level = PSO_GENERIC;

@@ -105,6 +110,8 @@ class MetalDevice : public Device {

  bool use_adaptive_compilation();

+  bool use_local_atomic_sort() const;
+
  bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);

  void make_source(MetalPipelineType pso_type, const uint kernel_features);
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
        arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
        arg_desc_as.access = MTLArgumentAccessReadOnly;

+        MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
+        arg_desc_ptrs.dataType = MTLDataTypePointer;
+        arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
+
        MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
        arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
        arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
        arg_desc_ift.index = index++;
        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
+        arg_desc_ift.index = index++;
+        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
+        arg_desc_ptrs.index = index++;
+        [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
+        arg_desc_ptrs.index = index++;
+        [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */

        [arg_desc_ift release];
        [arg_desc_as release];
+        [arg_desc_ptrs release];
      }
    }

    mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

+    // preparing the blas arg encoder
+    MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
+    arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
+    arg_desc_blas.access = MTLArgumentAccessReadOnly;
+    mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
+    [arg_desc_blas release];
+
    for (int i = 0; i < ancillary_desc.count; i++) {
      [ancillary_desc[i] release];
    }
@@ -271,6 +289,11 @@ bool MetalDevice::use_adaptive_compilation()
  return DebugFlags().metal.adaptive_compile;
 }

+bool MetalDevice::use_local_atomic_sort() const
+{
+  return DebugFlags().metal.use_local_atomic_sort;
+}
+
 void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
 {
  string global_defines;
@@ -278,6 +301,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
    global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
  }

+  if (use_local_atomic_sort()) {
+    global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
+  }
+
  if (use_metalrt) {
    global_defines += "#define __METALRT__\n";
    if (motion_blur) {
@@ -1231,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
    if (@available(macos 11.0, *)) {
      if (bvh->params.top_level) {
        bvhMetalRT = bvh_metal;
+
+        // allocate required buffers for BLAS array
+        uint64_t count = bvhMetalRT->blas_array.size();
+        uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
+        blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
+        stats.mem_alloc(blas_buffer.allocatedSize);
+
+        for (uint64_t i = 0; i < count; ++i) {
+          [mtlBlasArgEncoder setArgumentBuffer:blas_buffer
+                                        offset:i * mtlBlasArgEncoder.encodedLength];
+          [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
+        }
+
+        count = bvhMetalRT->blas_lookup.size();
+        bufferSize = sizeof(uint32_t) * count;
+        blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
+                                                    options:default_storage_mode];
+        stats.mem_alloc(blas_lookup_buffer.allocatedSize);
+
+        memcpy([blas_lookup_buffer contents],
+               bvhMetalRT -> blas_lookup.data(),
+               blas_lookup_buffer.allocatedSize);
+
+        if (default_storage_mode == MTLResourceStorageModeManaged) {
+          [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
+          [blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
+        }
      }
    }
  }
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -19,6 +19,8 @@ enum {
  METALRT_FUNC_SHADOW_BOX,
  METALRT_FUNC_LOCAL_TRI,
  METALRT_FUNC_LOCAL_BOX,
+  METALRT_FUNC_LOCAL_TRI_PRIM,
+  METALRT_FUNC_LOCAL_BOX_PRIM,
  METALRT_FUNC_CURVE_RIBBON,
  METALRT_FUNC_CURVE_RIBBON_SHADOW,
  METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
  METALRT_FUNC_NUM
 };

-enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+enum {
+  METALRT_TABLE_DEFAULT,
+  METALRT_TABLE_SHADOW,
+  METALRT_TABLE_LOCAL,
+  METALRT_TABLE_LOCAL_PRIM,
+  METALRT_TABLE_NUM
+};

 /* Pipeline State Object types */
 enum MetalPipelineType {
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -87,6 +87,9 @@ struct ShaderCache {
          break;
      }
    }
+
+    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
+    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
  }
  ~ShaderCache();

@@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
          "__anyhit__cycles_metalrt_shadow_all_hit_box",
          "__anyhit__cycles_metalrt_local_hit_tri",
          "__anyhit__cycles_metalrt_local_hit_box",
+          "__anyhit__cycles_metalrt_local_hit_tri_prim",
+          "__anyhit__cycles_metalrt_local_hit_box_prim",
          "__intersection__curve_ribbon",
          "__intersection__curve_ribbon_shadow",
          "__intersection__curve_all",
@@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                         nil];
+    table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
+        arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+                         nil];

    NSMutableSet *unique_functions = [NSMutableSet
        setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];

    if (kernel_has_intersection(device_kernel)) {
      linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
  virtual int num_concurrent_states(const size_t) const override;
  virtual int num_concurrent_busy_states(const size_t) const override;
  virtual int num_sort_partition_elements() const override;
+  virtual bool supports_local_atomic_sort() const override;

  virtual void init_execution() override;

--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
  return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
 }

+bool MetalDeviceQueue::supports_local_atomic_sort() const
+{
+  return metal_device_->use_local_atomic_sort();
+}
+
 void MetalDeviceQueue::init_execution()
 {
  /* Synchronize all textures and memory copies before executing task. */
@@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      if (metal_device_->bvhMetalRT) {
        id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
        [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
+        [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
+                                                  offset:0
+                                                 atIndex:7];
+        [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
+                                                  offset:0
+                                                 atIndex:8];
      }

      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      if (bvhMetalRT) {
        /* Mark all Accelerations resources as used */
        [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
+                                        usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
+                                        usage:MTLResourceUsageRead];
        [mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
                                         count:bvhMetalRT->blas_array.size()
                                         usage:MTLResourceUsageRead];
@@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      /* See parallel_active_index.h for why this amount of shared memory is needed.
       * Rounded up to 16 bytes for Metal */
      shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
-      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
      break;

+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
+      int key_count = metal_device_->launch_params.data.max_shaders;
+      shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
+      break;
+    }
+
    default:
      break;
  }

+  if (shared_mem_bytes) {
+    assert(shared_mem_bytes <= 32 * 1024);
+    [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
+  }
+
  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
      divide_up(work_size, num_threads_per_block), 1, 1);
  MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
    return METAL_GPU_INTEL;
  }
  else if (strstr(device_name, "AMD")) {
+    /* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
+    if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
+      if (atoi(str)) {
+        return METAL_GPU_UNKNOWN;
+      }
+    }
    return METAL_GPU_AMD;
  }
  else if (strstr(device_name, "Apple")) {
@@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
    return usable_devices;
  }

+  /* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
+   * one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
+  bool has_usable_amd_gpu = false;
+  if (@available(macos 12.3, *)) {
+    for (id<MTLDevice> device in MTLCopyAllDevices()) {
+      has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
+    }
+  }
+
  metal_printf("Usable Metal devices:\n");
  for (id<MTLDevice> device in MTLCopyAllDevices()) {
    string device_name = get_device_name(device);
@@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
    }

 #  if defined(MAC_OS_VERSION_13_0)
-    if (@available(macos 13.0, *)) {
-      usable |= (vendor == METAL_GPU_INTEL);
+    if (!has_usable_amd_gpu) {
+      if (@available(macos 13.0, *)) {
+        usable |= (vendor == METAL_GPU_INTEL);
+      }
    }
 #  endif

--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -377,7 +377,7 @@ void OneapiDevice::tex_alloc(device_texture &mem)
  generic_alloc(mem);
  generic_copy_to(mem);

-  /* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */
+  /* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
  const uint slot = mem.slot;
  if (slot >= texture_info_.size()) {
    texture_info_.resize(slot + 128);
@@ -631,9 +631,9 @@ bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
 /* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
 * since Windows driver 101.3268. */
 /* The same min compute-runtime version is currently required across Windows and Linux.
- * For Windows driver 101.3430, compute-runtime version is 23904. */
-static const int lowest_supported_driver_version_win = 1013430;
-static const int lowest_supported_driver_version_neo = 23904;
+ * For Windows driver 101.4032, compute-runtime version is 24931. */
+static const int lowest_supported_driver_version_win = 1014032;
+static const int lowest_supported_driver_version_neo = 24931;

 int OneapiDevice::parse_driver_build_version(const sycl::device &device)
 {
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
        context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
  }

+  OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
  vector<OptixStackSizes> osl_stack_size(osl_groups.size());

  /* Update SBT with new entries. */
  sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+    optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
  }
  for (size_t i = 0; i < osl_groups.size(); ++i) {
    if (osl_groups[i] != NULL) {
@@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
                                     0,
                                     &pipelines[PIP_SHADE]));

+    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
    unsigned int dss = 0;
    for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
      dss = std::max(dss, osl_stack_size[i].dssDC);
    }

    optix_assert(optixPipelineSetStackSize(
-        pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
+        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
  }

  return !have_error();
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -112,6 +112,13 @@ class DeviceQueue {
    return 65536;
  }

+  /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
+   * INTEGRATOR_SORT_WRITE_PASS)? */
+  virtual bool supports_local_atomic_sort() const
+  {
+    return false;
+  }
+
  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or path state.
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
          device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
      integrator_shader_sort_prefix_sum_(
          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
+      integrator_shader_sort_partition_key_offsets_(
+          device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
      integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
      integrator_next_shadow_path_index_(
          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
                                                                num_sort_partitions_);

-  /* Allocate arrays for shader sorting. */
-  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
-  if (integrator_shader_sort_counter_.size() < sort_buckets) {
-    integrator_shader_sort_counter_.alloc(sort_buckets);
-    integrator_shader_sort_counter_.zero_to_device();
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
-        (int *)integrator_shader_sort_counter_.device_pointer;
-
-    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
-    integrator_shader_sort_prefix_sum_.zero_to_device();
-  }
-
-  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
-    if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
-      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
-      integrator_shader_raytrace_sort_counter_.zero_to_device();
-      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
+    /* Allocate array for partitioned shader sorting using local atomics. */
+    const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
+    if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
+      integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
+      integrator_shader_sort_partition_key_offsets_.zero_to_device();
    }
+    integrator_state_gpu_.sort_partition_key_offsets =
+        (int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
  }
+  else {
+    /* Allocate arrays for shader sorting. */
+    const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+    if (integrator_shader_sort_counter_.size() < sort_buckets) {
+      integrator_shader_sort_counter_.alloc(sort_buckets);
+      integrator_shader_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+          (int *)integrator_shader_sort_counter_.device_pointer;

-  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
-    if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
-      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
-      integrator_shader_mnee_sort_counter_.zero_to_device();
-      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+      integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
+      integrator_shader_sort_prefix_sum_.zero_to_device();
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
+        integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+        integrator_shader_raytrace_sort_counter_.zero_to_device();
+        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+            (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+      }
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+      if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
+        integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+        integrator_shader_mnee_sort_counter_.zero_to_device();
+        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+            (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+      }
    }
  }
 }
@@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
    work_size = num_queued;
    d_path_index = queued_paths_.device_pointer;

-    compute_sorted_queued_paths(
-        DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
+    compute_sorted_queued_paths(kernel, num_paths_limit);
  }
  else if (num_queued < work_size) {
    work_size = num_queued;
@@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
  }
 }

-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
-                                                   DeviceKernel queued_kernel,
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
                                                   const int num_paths_limit)
 {
  int d_queued_kernel = queued_kernel;
+
+  /* Launch kernel to fill the active paths arrays. */
+  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
+    const int work_size = kernel_max_active_main_path_index(queued_kernel);
+    device_ptr d_queued_paths = queued_paths_.device_pointer;
+
+    int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
+
+    DeviceKernelArguments args(
+        &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
+
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
+    return;
+  }
+
  device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
  device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
  assert(d_counter != 0 && d_prefix_sum != 0);
@@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
                               &d_prefix_sum,
                               &d_queued_kernel);

-    queue_->enqueue(kernel, work_size, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
  }
 }

--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
  void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);

  void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
-  void compute_sorted_queued_paths(DeviceKernel kernel,
-                                   DeviceKernel queued_kernel,
-                                   const int num_paths_limit);
+  void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);

  void compact_main_paths(const int num_active_paths);
  void compact_shadow_paths();
@@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
  device_vector<int> integrator_shader_raytrace_sort_counter_;
  device_vector<int> integrator_shader_mnee_sort_counter_;
  device_vector<int> integrator_shader_sort_prefix_sum_;
+  device_vector<int> integrator_shader_sort_partition_key_offsets_;
  /* Path split. */
  device_vector<int> integrator_next_main_path_index_;
  device_vector<int> integrator_next_shadow_path_index_;
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -170,7 +170,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
    case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
    case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
      label = bsdf_microfacet_ggx_sample(
-          kg, sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
+          sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
      break;
    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
@@ -185,7 +185,7 @@ ccl_device_inline int bsdf_sample(KernelGlobals kg,
    case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
    case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
      label = bsdf_microfacet_beckmann_sample(
-          kg, sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
+          sc, Ng, sd->wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
      break;
    case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
      label = bsdf_ashikhmin_shirley_sample(
@@ -661,4 +661,38 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
 #endif
 }

+ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
+                                       ccl_private const ShaderClosure *sc)
+{
+  Spectrum albedo = sc->weight;
+  /* Some closures include additional components such as Fresnel terms that cause their albedo to
+   * be below 1. The point of this function is to return a best-effort estimation of their albedo,
+   * meaning the amount of reflected/refracted light that would be expected when illuminated by a
+   * uniform white background.
+   * This is used for the denoising albedo pass and diffuse/glossy/transmission color passes.
+   * NOTE: This should always match the sample_weight of the closure - as in, if there's an albedo
+   * adjustment in here, the sample_weight should also be reduced accordingly.
+   * TODO(lukas): Consider calling this function to determine the sample_weight? Would be a bit of
+   * extra overhead though. */
+#if defined(__SVM__) || defined(__OSL__)
+  switch (sc->type) {
+    case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
+    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
+    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
+    case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
+      albedo *= microfacet_fresnel((ccl_private const MicrofacetBsdf *)sc, sd->wi, sc->N);
+      break;
+    case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
+      albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
+      break;
+    case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
+      albedo *= bsdf_principled_hair_albedo(sc);
+      break;
+    default:
+      break;
+  }
+#endif
+  return albedo;
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -41,11 +41,6 @@ static_assert(sizeof(ShaderClosure) >= sizeof(PrincipledHairBSDF),
 static_assert(sizeof(ShaderClosure) >= sizeof(PrincipledHairExtra),
              "PrincipledHairExtra is too large!");

-ccl_device_inline float cos_from_sin(const float s)
-{
-  return safe_sqrtf(1.0f - s * s);
-}
-
 /* Gives the change in direction in the normal plane for the given angles and p-th-order
 * scattering. */
 ccl_device_inline float delta_phi(int p, float gamma_o, float gamma_t)
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -23,8 +23,6 @@ enum MicrofacetType {

 typedef struct MicrofacetExtra {
  Spectrum color, cspec0;
-  Spectrum fresnel_color;
-  float clearcoat;
 } MicrofacetExtra;

 typedef struct MicrofacetBsdf {
@@ -37,190 +35,99 @@ typedef struct MicrofacetBsdf {

 static_assert(sizeof(ShaderClosure) >= sizeof(MicrofacetBsdf), "MicrofacetBsdf is too large!");

-/* Beckmann and GGX microfacet importance sampling. */
-
-ccl_device_inline void microfacet_beckmann_sample_slopes(KernelGlobals kg,
-                                                         const float cos_theta_i,
-                                                         const float sin_theta_i,
-                                                         float randu,
-                                                         float randv,
-                                                         ccl_private float *slope_x,
-                                                         ccl_private float *slope_y,
-                                                         ccl_private float *G1i)
-{
-  /* Special case (normal incidence). */
-  if (cos_theta_i >= 0.99999f) {
-    const float r = sqrtf(-logf(randu));
-    const float phi = M_2PI_F * randv;
-    *slope_x = r * cosf(phi);
-    *slope_y = r * sinf(phi);
-    *G1i = 1.0f;
-
-    return;
-  }
-
-  /* Precomputations. */
-  const float tan_theta_i = sin_theta_i / cos_theta_i;
-  const float inv_a = tan_theta_i;
-  const float cot_theta_i = 1.0f / tan_theta_i;
-  const float erf_a = fast_erff(cot_theta_i);
-  const float exp_a2 = expf(-cot_theta_i * cot_theta_i);
-  const float SQRT_PI_INV = 0.56418958354f;
-  const float Lambda = 0.5f * (erf_a - 1.0f) + (0.5f * SQRT_PI_INV) * (exp_a2 * inv_a);
-  const float G1 = 1.0f / (1.0f + Lambda); /* masking */
-
-  *G1i = G1;
-
-  /* Based on paper from Wenzel Jakob
-   * An Improved Visible Normal Sampling Routine for the Beckmann Distribution
-   *
-   * http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
-   *
-   * Reformulation from OpenShadingLanguage which avoids using inverse
-   * trigonometric functions.
-   */
-
-  /* Sample slope X.
-   *
-   * Compute a coarse approximation using the approximation:
-   *   exp(-ierf(x)^2) ~= 1 - x * x
-   *   solve y = 1 + b + K * (1 - b * b)
-   */
-  const float K = tan_theta_i * SQRT_PI_INV;
-  const float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
-  const float y_exact = randu * (1.0f + erf_a + K * exp_a2);
-  float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
-
-  float inv_erf = fast_ierff(b);
-  float2 begin = make_float2(-1.0f, -y_exact);
-  float2 end = make_float2(erf_a, 1.0f + erf_a + K * exp_a2 - y_exact);
-  float2 current = make_float2(b, 1.0f + b + K * expf(-sqr(inv_erf)) - y_exact);
-
-  /* Find root in a monotonic interval using newton method, under given precision and maximal
-   * iterations. Falls back to bisection if newton step produces results outside of the valid
-   * interval.*/
-  const float precision = 1e-6f;
-  const int max_iter = 3;
-  int iter = 0;
-  while (fabsf(current.y) > precision && iter++ < max_iter) {
-    if (signf(begin.y) == signf(current.y)) {
-      begin.x = current.x;
-      begin.y = current.y;
-    }
-    else {
-      end.x = current.x;
-    }
-    const float newton_x = current.x - current.y / (1.0f - inv_erf * tan_theta_i);
-    current.x = (newton_x >= begin.x && newton_x <= end.x) ? newton_x : 0.5f * (begin.x + end.x);
-    inv_erf = fast_ierff(current.x);
-    current.y = 1.0f + current.x + K * expf(-sqr(inv_erf)) - y_exact;
-  }
-
-  *slope_x = inv_erf;
-  *slope_y = fast_ierff(2.0f * randv - 1.0f);
-}
-
-/* GGX microfacet importance sampling from:
- *
+/* Beckmann VNDF importance sampling algorithm from:
 * Importance Sampling Microfacet-Based BSDFs using the Distribution of Visible Normals.
- * E. Heitz and E. d'Eon, EGSR 2014
- */
+ * Eric Heitz and Eugene d'Eon, EGSR 2014.
+ * https://hal.inria.fr/hal-00996995v2/document */

-ccl_device_inline void microfacet_ggx_sample_slopes(const float cos_theta_i,
-                                                    const float sin_theta_i,
-                                                    float randu,
-                                                    float randv,
-                                                    ccl_private float *slope_x,
-                                                    ccl_private float *slope_y,
-                                                    ccl_private float *G1i)
-{
-  /* Special case (normal incidence). */
-  if (cos_theta_i >= 0.99999f) {
-    const float r = sqrtf(randu / (1.0f - randu));
-    const float phi = M_2PI_F * randv;
-    *slope_x = r * cosf(phi);
-    *slope_y = r * sinf(phi);
-    *G1i = 1.0f;
-
-    return;
-  }
-
-  /* Precomputations. */
-  const float tan_theta_i = sin_theta_i / cos_theta_i;
-  const float G1_inv = 0.5f * (1.0f + safe_sqrtf(1.0f + tan_theta_i * tan_theta_i));
-
-  *G1i = 1.0f / G1_inv;
-
-  /* Sample slope_x. */
-  const float A = 2.0f * randu * G1_inv - 1.0f;
-  const float AA = A * A;
-  const float tmp = 1.0f / (AA - 1.0f);
-  const float B = tan_theta_i;
-  const float BB = B * B;
-  const float D = safe_sqrtf(BB * (tmp * tmp) - (AA - BB) * tmp);
-  const float slope_x_1 = B * tmp - D;
-  const float slope_x_2 = B * tmp + D;
-  *slope_x = (A < 0.0f || slope_x_2 * tan_theta_i > 1.0f) ? slope_x_1 : slope_x_2;
-
-  /* Sample slope_y. */
-  float S;
-
-  if (randv > 0.5f) {
-    S = 1.0f;
-    randv = 2.0f * (randv - 0.5f);
-  }
-  else {
-    S = -1.0f;
-    randv = 2.0f * (0.5f - randv);
-  }
-
-  const float z = (randv * (randv * (randv * 0.27385f - 0.73369f) + 0.46341f)) /
-                  (randv * (randv * (randv * 0.093073f + 0.309420f) - 1.000000f) + 0.597999f);
-  *slope_y = S * z * safe_sqrtf(1.0f + (*slope_x) * (*slope_x));
-}
-
-template<MicrofacetType m_type>
-ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals kg,
-                                                          const float3 wi,
-                                                          const float alpha_x,
-                                                          const float alpha_y,
-                                                          const float randu,
-                                                          const float randv,
-                                                          ccl_private float *G1i)
+ccl_device_forceinline float3 microfacet_beckmann_sample_vndf(const float3 wi,
+                                                              const float alpha_x,
+                                                              const float alpha_y,
+                                                              const float randu,
+                                                              const float randv)
 {
  /* 1. stretch wi */
  float3 wi_ = make_float3(alpha_x * wi.x, alpha_y * wi.y, wi.z);
  wi_ = normalize(wi_);

-  /* Compute polar coordinates of wi_. */
-  float costheta_ = 1.0f;
-  float sintheta_ = 0.0f;
-  float cosphi_ = 1.0f;
-  float sinphi_ = 0.0f;
-
-  if (wi_.z < 0.99999f) {
-    costheta_ = wi_.z;
-    sintheta_ = safe_sqrtf(1.0f - costheta_ * costheta_);
-
-    float invlen = 1.0f / sintheta_;
-    cosphi_ = wi_.x * invlen;
-    sinphi_ = wi_.y * invlen;
-  }
-
  /* 2. sample P22_{wi}(x_slope, y_slope, 1, 1) */
  float slope_x, slope_y;
+  float cos_phi_i = 1.0f;
+  float sin_phi_i = 0.0f;

-  if (m_type == MicrofacetType::BECKMANN) {
-    microfacet_beckmann_sample_slopes(
-        kg, costheta_, sintheta_, randu, randv, &slope_x, &slope_y, G1i);
+  if (wi_.z >= 0.99999f) {
+    /* Special case (normal incidence). */
+    const float r = sqrtf(-logf(randu));
+    const float phi = M_2PI_F * randv;
+    slope_x = r * cosf(phi);
+    slope_y = r * sinf(phi);
  }
  else {
-    microfacet_ggx_sample_slopes(costheta_, sintheta_, randu, randv, &slope_x, &slope_y, G1i);
+    /* Precomputations. */
+    const float cos_theta_i = wi_.z;
+    const float sin_theta_i = sin_from_cos(cos_theta_i);
+    const float tan_theta_i = sin_theta_i / cos_theta_i;
+    const float cot_theta_i = 1.0f / tan_theta_i;
+    const float erf_a = fast_erff(cot_theta_i);
+    const float exp_a2 = expf(-cot_theta_i * cot_theta_i);
+    const float SQRT_PI_INV = 0.56418958354f;
+
+    float invlen = 1.0f / sin_theta_i;
+    cos_phi_i = wi_.x * invlen;
+    sin_phi_i = wi_.y * invlen;
+
+    /* Based on the paper by Wenzel Jakob:
+     * An Improved Visible Normal Sampling Routine for the Beckmann Distribution
+     *
+     * http://www.mitsuba-renderer.org/~wenzel/files/visnormal.pdf
+     *
+     * Uses the reformulation from OpenShadingLanguage, which avoids inverse
+     * trigonometric functions.
+     */
+
+    /* Sample slope X.
+     *
+     * Compute a coarse initial guess using the approximation:
+     *   exp(-ierf(x)^2) ~= 1 - x * x
+     *   and solving y = 1 + b + K * (1 - b * b)
+     */
+    const float K = tan_theta_i * SQRT_PI_INV;
+    const float y_approx = randu * (1.0f + erf_a + K * (1 - erf_a * erf_a));
+    const float y_exact = randu * (1.0f + erf_a + K * exp_a2);
+    float b = K > 0 ? (0.5f - sqrtf(K * (K - y_approx + 1.0f) + 0.25f)) / K : y_approx - 1.0f;
+
+    float inv_erf = fast_ierff(b);
+    float2 begin = make_float2(-1.0f, -y_exact);
+    float2 end = make_float2(erf_a, 1.0f + erf_a + K * exp_a2 - y_exact);
+    float2 current = make_float2(b, 1.0f + b + K * expf(-sqr(inv_erf)) - y_exact);
+
+    /* Find root in a monotonic interval using Newton's method, under given precision and maximal
+     * iterations. Falls back to bisection if the Newton step lands outside of the valid
+     * interval. */
+    const float precision = 1e-6f;
+    const int max_iter = 3;
+    int iter = 0;
+    while (fabsf(current.y) > precision && iter++ < max_iter) {
+      if (signf(begin.y) == signf(current.y)) {
+        begin.x = current.x;
+        begin.y = current.y;
+      }
+      else {
+        end.x = current.x;
+      }
+      const float newton_x = current.x - current.y / (1.0f - inv_erf * tan_theta_i);
+      current.x = (newton_x >= begin.x && newton_x <= end.x) ? newton_x : 0.5f * (begin.x + end.x);
+      inv_erf = fast_ierff(current.x);
+      current.y = 1.0f + current.x + K * expf(-sqr(inv_erf)) - y_exact;
+    }
+
+    slope_x = inv_erf;
+    slope_y = fast_ierff(2.0f * randv - 1.0f);
  }

  /* 3. rotate */
-  float tmp = cosphi_ * slope_x - sinphi_ * slope_y;
-  slope_y = sinphi_ * slope_x + cosphi_ * slope_y;
+  float tmp = cos_phi_i * slope_x - sin_phi_i * slope_y;
+  slope_y = sin_phi_i * slope_x + cos_phi_i * slope_y;
  slope_x = tmp;

  /* 4. unstretch */
@@ -231,6 +138,43 @@ ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals kg,
  return normalize(make_float3(-slope_x, -slope_y, 1.0f));
 }

+/* GGX VNDF importance sampling algorithm from:
+ * Sampling the GGX Distribution of Visible Normals.
+ * Eric Heitz, JCGT Vol. 7, No. 4, 2018.
+ * https://jcgt.org/published/0007/04/01/ */
+ccl_device_forceinline float3 microfacet_ggx_sample_vndf(const float3 wi,
+                                                         const float alpha_x,
+                                                         const float alpha_y,
+                                                         const float randu,
+                                                         const float randv)
+{
+  /* Section 3.2: Transforming the view direction to the hemisphere configuration. */
+  float3 wi_ = normalize(make_float3(alpha_x * wi.x, alpha_y * wi.y, wi.z));
+
+  /* Section 4.1: Orthonormal basis. */
+  float lensq = sqr(wi_.x) + sqr(wi_.y);
+  float3 T1, T2;
+  if (lensq > 1e-7f) {
+    T1 = make_float3(-wi_.y, wi_.x, 0.0f) * inversesqrtf(lensq);
+    T2 = cross(wi_, T1);
+  }
+  else {
+    /* Normal incidence, any basis is fine. */
+    T1 = make_float3(1.0f, 0.0f, 0.0f);
+    T2 = make_float3(0.0f, 1.0f, 0.0f);
+  }
+
+  /* Section 4.2: Parameterization of the projected area. */
+  float2 t = concentric_sample_disk(randu, randv);
+  t.y = mix(safe_sqrtf(1.0f - sqr(t.x)), t.y, 0.5f * (1.0f + wi_.z));
+
+  /* Section 4.3: Reprojection onto hemisphere. */
+  float3 H_ = t.x * T1 + t.y * T2 + safe_sqrtf(1.0f - len_squared(t)) * wi_;
+
+  /* Section 3.4: Transforming the normal back to the ellipsoid configuration. */
+  return normalize(make_float3(alpha_x * H_.x, alpha_y * H_.y, max(0.0f, H_.z)));
+}
+
 /* Calculate the reflection color
 *
 * If fresnel is used, the color is an interpolation of the F0 color and white
@@ -238,26 +182,25 @@ ccl_device_forceinline float3 microfacet_sample_stretched(KernelGlobals kg,
 *
 * Else it is simply white
 */
-ccl_device_forceinline Spectrum reflection_color(ccl_private const MicrofacetBsdf *bsdf,
-                                                 float3 L,
-                                                 float3 H)
+ccl_device_forceinline Spectrum microfacet_fresnel(ccl_private const MicrofacetBsdf *bsdf,
+                                                   float3 wi,
+                                                   float3 H)
 {
-  Spectrum F = one_spectrum();
-
-  bool use_clearcoat = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
-  bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || use_clearcoat);
-
-  if (use_fresnel) {
-    float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
-
-    F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
+  if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type)) {
+    return interpolate_fresnel_color(wi, H, bsdf->ior, bsdf->extra->cspec0);
  }
-
-  if (use_clearcoat) {
-    F *= 0.25f * bsdf->extra->clearcoat;
+  else if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
+    return make_spectrum(fresnel_dielectric_cos(dot(wi, H), bsdf->ior));
  }
+  else {
+    return one_spectrum();
+  }
+}

-  return F;
+ccl_device_forceinline void bsdf_microfacet_adjust_weight(ccl_private const ShaderData *sd,
+                                                          ccl_private MicrofacetBsdf *bsdf)
+{
+  bsdf->sample_weight *= average(microfacet_fresnel(bsdf, sd->wi, bsdf->N));
 }

 /* Generalized Trowbridge-Reitz for clearcoat. */
@@ -271,37 +214,48 @@ ccl_device_forceinline float bsdf_clearcoat_D(float alpha2, float cos_NH)
  return (alpha2 - 1.0f) / (M_PI_F * logf(alpha2) * t);
 }

-/* Monodirectional shadowing-masking term. */
+/* Smith shadowing-masking term, here in the non-separable form.
+ * For details, see:
+ * Understanding the Masking-Shadowing Function in Microfacet-Based BRDFs.
+ * Eric Heitz, JCGT Vol. 3, No. 2, 2014.
+ * https://jcgt.org/published/0003/02/03/ */
 template<MicrofacetType m_type>
-ccl_device_inline float bsdf_G1_from_sqr_alpha_tan_n(float sqr_alpha_tan_n)
+ccl_device_inline float bsdf_lambda_from_sqr_alpha_tan_n(float sqr_alpha_tan_n)
 {
  if (m_type == MicrofacetType::GGX) {
-    return 2.0f / (1.0f + sqrtf(1.0f + sqr_alpha_tan_n));
+    /* Equation 72. */
+    return 0.5f * (sqrtf(1.0f + sqr_alpha_tan_n) - 1.0f);
  }
  else {
-    /* m_type == MicrofacetType::BECKMANN */
+    /* m_type == MicrofacetType::BECKMANN
+     * Approximation from below Equation 69. */
+    if (sqr_alpha_tan_n < 0.39f) {
+      /* Equivalent to a >= 1.6f, but also handles sqr_alpha_tan_n == 0.0f cleanly. */
+      return 0.0f;
+    }
+
    const float a = inversesqrtf(sqr_alpha_tan_n);
-    return (a > 1.6f) ? 1.0f : ((2.181f * a + 3.535f) * a) / ((2.577f * a + 2.276f) * a + 1.0f);
+    return ((0.396f * a - 1.259f) * a + 1.0f) / ((2.181f * a + 3.535f) * a);
  }
 }

-template<MicrofacetType m_type> ccl_device_inline float bsdf_G1(float alpha2, float cos_N)
+template<MicrofacetType m_type> ccl_device_inline float bsdf_lambda(float alpha2, float cos_N)
 {
-  return bsdf_G1_from_sqr_alpha_tan_n<m_type>(alpha2 * fmaxf(1.0f / (cos_N * cos_N) - 1.0f, 0.0f));
+  return bsdf_lambda_from_sqr_alpha_tan_n<m_type>(alpha2 * fmaxf(1.0f / sqr(cos_N) - 1.0f, 0.0f));
 }

 template<MicrofacetType m_type>
-ccl_device_inline float bsdf_aniso_G1(float alpha_x, float alpha_y, float3 V)
+ccl_device_inline float bsdf_aniso_lambda(float alpha_x, float alpha_y, float3 V)
 {
-  return bsdf_G1_from_sqr_alpha_tan_n<m_type>((sqr(alpha_x * V.x) + sqr(alpha_y * V.y)) /
-                                              sqr(V.z));
+  const float sqr_alpha_tan_n = (sqr(alpha_x * V.x) + sqr(alpha_y * V.y)) / sqr(V.z);
+  return bsdf_lambda_from_sqr_alpha_tan_n<m_type>(sqr_alpha_tan_n);
 }

-/* Smith's separable shadowing-masking term. */
+/* Combined shadowing-masking term. */
 template<MicrofacetType m_type>
 ccl_device_inline float bsdf_G(float alpha2, float cos_NI, float cos_NO)
 {
-  return bsdf_G1<m_type>(alpha2, cos_NI) * bsdf_G1<m_type>(alpha2, cos_NO);
+  return 1.0f / (1.0f + bsdf_lambda<m_type>(alpha2, cos_NI) + bsdf_lambda<m_type>(alpha2, cos_NO));
 }

 /* Normal distribution function. */
@@ -335,22 +289,6 @@ ccl_device_inline float bsdf_aniso_D(float alpha_x, float alpha_y, float3 H)
  }
 }

-ccl_device_forceinline void bsdf_microfacet_fresnel_color(ccl_private const ShaderData *sd,
-                                                          ccl_private MicrofacetBsdf *bsdf)
-{
-  kernel_assert(CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type));
-
-  float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
-  bsdf->extra->fresnel_color = interpolate_fresnel_color(
-      sd->wi, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0);
-
-  if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
-    bsdf->extra->fresnel_color *= 0.25f * bsdf->extra->clearcoat;
-  }
-
-  bsdf->sample_weight *= average(bsdf->extra->fresnel_color);
-}
-
 template<MicrofacetType m_type>
 ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
                                         const float3 Ng,
@@ -382,7 +320,7 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
  H *= inv_len_H;

  const float cos_NH = dot(N, H);
-  float D, G1i, G1o;
+  float D, lambdaI, lambdaO;

  /* TODO: add support for anisotropic transmission. */
  if (alpha_x == alpha_y || m_refractive) { /* Isotropic. */
@@ -399,8 +337,8 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
      D = bsdf_D<m_type>(alpha2, cos_NH);
    }

-    G1i = bsdf_G1<m_type>(alpha2, cos_NI);
-    G1o = bsdf_G1<m_type>(alpha2, cos_NO);
+    lambdaI = bsdf_lambda<m_type>(alpha2, cos_NI);
+    lambdaO = bsdf_lambda<m_type>(alpha2, cos_NO);
  }
  else { /* Anisotropic. */
    float3 X, Y;
@@ -412,25 +350,23 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,

    D = bsdf_aniso_D<m_type>(alpha_x, alpha_y, local_H);

-    G1i = bsdf_aniso_G1<m_type>(alpha_x, alpha_y, local_I);
-    G1o = bsdf_aniso_G1<m_type>(alpha_x, alpha_y, local_O);
+    lambdaI = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_I);
+    lambdaO = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_O);
  }

-  const float common = G1i * D / cos_NI *
+  const float common = D / cos_NI *
                       (m_refractive ?
                            sqr(bsdf->ior * inv_len_H) * fabsf(dot(H, wi) * dot(H, wo)) :
                            0.25f);

-  *pdf = common;
+  *pdf = common / (1.0f + lambdaI);

-  const Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, wo, H);
-
-  return F * G1o * common;
+  const Spectrum F = microfacet_fresnel(bsdf, wo, H);
+  return F * common / (1.0f + lambdaO + lambdaI);
 }

 template<MicrofacetType m_type>
-ccl_device int bsdf_microfacet_sample(KernelGlobals kg,
-                                      ccl_private const ShaderClosure *sc,
+ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
                                      float3 Ng,
                                      float3 wi,
                                      float randu,
@@ -466,10 +402,15 @@ ccl_device int bsdf_microfacet_sample(KernelGlobals kg,

  /* Importance sampling with distribution of visible normals. Vectors are transformed to local
   * space before and after sampling. */
-  float G1i;
  const float3 local_I = make_float3(dot(X, wi), dot(Y, wi), cos_NI);
-  const float3 local_H = microfacet_sample_stretched<m_type>(
-      kg, local_I, alpha_x, alpha_y, randu, randv, &G1i);
+  float3 local_H;
+  if (m_type == MicrofacetType::GGX) {
+    local_H = microfacet_ggx_sample_vndf(local_I, alpha_x, alpha_y, randu, randv);
+  }
+  else {
+    /* m_type == MicrofacetType::BECKMANN */
+    local_H = microfacet_beckmann_sample_vndf(local_I, alpha_x, alpha_y, randu, randv);
+  }

  const float3 H = X * local_H.x + Y * local_H.y + N * local_H.z;
  const float cos_NH = local_H.z;
@@ -502,19 +443,12 @@ ccl_device int bsdf_microfacet_sample(KernelGlobals kg,
    label |= LABEL_SINGULAR;
    /* Some high number for MIS. */
    *pdf = 1e6f;
-    *eval = make_spectrum(1e6f);
-
-    bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
-                        bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
-
-    if (use_fresnel && !m_refractive) {
-      *eval *= reflection_color(bsdf, *wo, H);
-    }
+    *eval = make_spectrum(1e6f) * microfacet_fresnel(bsdf, *wo, H);
  }
  else {
    label |= LABEL_GLOSSY;
    float cos_NO = dot(N, *wo);
-    float D, G1o;
+    float D, lambdaI, lambdaO;

    /* TODO: add support for anisotropic transmission. */
    if (alpha_x == alpha_y || m_refractive) { /* Isotropic. */
@@ -526,34 +460,32 @@ ccl_device int bsdf_microfacet_sample(KernelGlobals kg,
        /* The masking-shadowing term for clearcoat has a fixed alpha of 0.25
         * => alpha2 = 0.25 * 0.25 */
        alpha2 = 0.0625f;
-
-        /* Recalculate G1i. */
-        G1i = bsdf_G1<m_type>(alpha2, cos_NI);
      }
      else {
        D = bsdf_D<m_type>(alpha2, cos_NH);
      }

-      G1o = bsdf_G1<m_type>(alpha2, cos_NO);
+      lambdaO = bsdf_lambda<m_type>(alpha2, cos_NO);
+      lambdaI = bsdf_lambda<m_type>(alpha2, cos_NI);
    }
    else { /* Anisotropic. */
      const float3 local_O = make_float3(dot(X, *wo), dot(Y, *wo), cos_NO);

      D = bsdf_aniso_D<m_type>(alpha_x, alpha_y, local_H);

-      G1o = bsdf_aniso_G1<m_type>(alpha_x, alpha_y, local_O);
+      lambdaO = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_O);
+      lambdaI = bsdf_aniso_lambda<m_type>(alpha_x, alpha_y, local_I);
    }

    const float cos_HO = dot(H, *wo);
-    const float common = G1i * D / cos_NI *
+    const float common = D / cos_NI *
                         (m_refractive ? fabsf(cos_HI * cos_HO) / sqr(cos_HO + cos_HI / m_eta) :
                                         0.25f);

-    *pdf = common;
+    *pdf = common / (1.0f + lambdaI);

-    Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, *wo, H);
-
-    *eval = G1o * common * F;
+    Spectrum F = microfacet_fresnel(bsdf, *wo, H);
+    *eval = F * common / (1.0f + lambdaI + lambdaO);
  }

  *sampled_roughness = make_float2(alpha_x, alpha_y);
@@ -587,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
  return SD_BSDF | SD_BSDF_HAS_EVAL;
 }

-/* Required to maintain OSL interface. */
-ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
-{
-  bsdf->alpha_y = bsdf->alpha_x;
-
-  return bsdf_microfacet_ggx_setup(bsdf);
-}
-
 ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
                                                 ccl_private const ShaderData *sd)
 {
@@ -605,7 +529,7 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd

  bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;

-  bsdf_microfacet_fresnel_color(sd, bsdf);
+  bsdf_microfacet_adjust_weight(sd, bsdf);

  return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
@@ -613,14 +537,12 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
 ccl_device int bsdf_microfacet_ggx_clearcoat_setup(ccl_private MicrofacetBsdf *bsdf,
                                                   ccl_private const ShaderData *sd)
 {
-  bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
-
  bsdf->alpha_x = saturatef(bsdf->alpha_x);
  bsdf->alpha_y = bsdf->alpha_x;

  bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;

-  bsdf_microfacet_fresnel_color(sd, bsdf);
+  bsdf_microfacet_adjust_weight(sd, bsdf);

  return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
@@ -654,8 +576,7 @@ ccl_device Spectrum bsdf_microfacet_ggx_eval(ccl_private const ShaderClosure *sc
  return bsdf_microfacet_eval<MicrofacetType::GGX>(sc, Ng, wi, wo, pdf);
 }

-ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
-                                          ccl_private const ShaderClosure *sc,
+ccl_device int bsdf_microfacet_ggx_sample(ccl_private const ShaderClosure *sc,
                                          float3 Ng,
                                          float3 wi,
                                          float randu,
@@ -667,7 +588,7 @@ ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
                                          ccl_private float *eta)
 {
  return bsdf_microfacet_sample<MicrofacetType::GGX>(
-      kg, sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
+      sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
 }

 /* Beckmann microfacet with Smith shadow-masking from:
@@ -684,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
  return SD_BSDF | SD_BSDF_HAS_EVAL;
 }

-/* Required to maintain OSL interface. */
-ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
-{
-  bsdf->alpha_y = bsdf->alpha_x;
-
-  return bsdf_microfacet_beckmann_setup(bsdf);
-}
-
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
 {
  bsdf->alpha_x = saturatef(bsdf->alpha_x);
@@ -718,8 +631,7 @@ ccl_device Spectrum bsdf_microfacet_beckmann_eval(ccl_private const ShaderClosur
  return bsdf_microfacet_eval<MicrofacetType::BECKMANN>(sc, Ng, wi, wo, pdf);
 }

-ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
-                                               ccl_private const ShaderClosure *sc,
+ccl_device int bsdf_microfacet_beckmann_sample(ccl_private const ShaderClosure *sc,
                                               float3 Ng,
                                               float3 wi,
                                               float randu,
@@ -731,7 +643,7 @@ ccl_device int bsdf_microfacet_beckmann_sample(KernelGlobals kg,
                                               ccl_private float *eta)
 {
  return bsdf_microfacet_sample<MicrofacetType::BECKMANN>(
-      kg, sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
+      sc, Ng, wi, randu, randv, eval, wo, pdf, sampled_roughness, eta);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -43,7 +43,7 @@ ccl_device_forceinline float2 mf_sampleP22_11(const float cosI,
    return make_float2(r * cosf(phi), r * sinf(phi));
  }

-  const float sinI = safe_sqrtf(1.0f - cosI * cosI);
+  const float sinI = sin_from_cos(cosI);
  const float tanI = sinI / cosI;
  const float projA = 0.5f * (cosI + 1.0f);
  if (projA < 0.0001f)
@@ -401,7 +401,7 @@ ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(ccl_private MicrofacetBsd

  bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;

-  bsdf_microfacet_fresnel_color(sd, bsdf);
+  bsdf_microfacet_adjust_weight(sd, bsdf);

  return bsdf_microfacet_multi_ggx_common_setup(bsdf);
 }
@@ -575,7 +575,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private Microfa

  bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;

-  bsdf_microfacet_fresnel_color(sd, bsdf);
+  bsdf_microfacet_adjust_weight(sd, bsdf);

  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
 }
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -73,9 +73,8 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
  eval = make_spectrum(val);
 #endif

-  float F0 = fresnel_dielectric_cos(1.0f, eta);
  if (use_fresnel) {
-    throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
+    throughput = interpolate_fresnel_color(wi, wh, eta, cspec0);

    eval *= throughput;
  }
@@ -144,11 +143,11 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
        throughput *= color;
      }
      else if (use_fresnel && order > 0) {
-        throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
+        throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
      }
 #else /* MF_MULTI_GLOSSY */
      if (use_fresnel && order > 0) {
-        throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
+        throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
      }
      wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
@@ -192,8 +191,6 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
  float G1_r = 0.0f;
  bool outside = true;

-  float F0 = fresnel_dielectric_cos(1.0f, eta);
-
  int order;
  for (order = 0; order < 10; order++) {
    /* Sample microfacet height. */
@@ -229,22 +226,12 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
        throughput *= color;
      }
      else {
-        Spectrum t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
-
-        if (order == 0)
-          throughput = t_color;
-        else
-          throughput *= t_color;
+        throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
      }
    }
 #else /* MF_MULTI_GLOSSY */
    if (use_fresnel) {
-      Spectrum t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
-
-      if (order == 0)
-        throughput = t_color;
-      else
-        throughput *= t_color;
+      throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
    }
    wr = mf_sample_phase_glossy(-wr, &throughput, wm);
 #endif
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -89,19 +89,21 @@ ccl_device float schlick_fresnel(float u)
  return m2 * m2 * m;  // pow(m, 5)
 }

-/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
-ccl_device_forceinline Spectrum
-interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, Spectrum cspec0)
+/* Calculate the Fresnel color, which is a blend between white and the F0 color. */
+ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
+                                                          float3 H,
+                                                          float ior,
+                                                          Spectrum F0)
 {
-  /* Calculate the fresnel interpolation factor
-   * The value from fresnel_dielectric_cos(...) has to be normalized because
-   * the cspec0 keeps the F0 color
-   */
-  float F0_norm = 1.0f / (1.0f - F0);
-  float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
+  /* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
+   * The reason why we use this remapping instead of directly doing the
+   * Schlick approximation lerp(F0, 1.0, (1.0-cosLH)^5) is that for cases
+   * with similar IORs (e.g. ice in water), the relative IOR can be close
+   * enough to 1.0 that the Schlick approximation becomes inaccurate. */
+  float real_F = fresnel_dielectric_cos(dot(L, H), ior);
+  float real_F0 = fresnel_dielectric_cos(1.0f, ior);

-  /* Blend between white and a specular color with respect to the fresnel */
-  return cspec0 * (1.0f - FH) + make_spectrum(FH);
+  return mix(F0, one_spectrum(), inverse_lerp(real_F0, 1.0f, real_F));
 }

 ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)
--- a/intern/cycles/kernel/closure/volume.h
+++ b/intern/cycles/kernel/closure/volume.h
@@ -88,7 +88,7 @@ henyey_greenstrein_sample(float3 D, float g, float randu, float randv, ccl_priva
    }
  }

-  float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
+  float sin_theta = sin_from_cos(cos_theta);
  float phi = M_2PI_F * randv;
  float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);

--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 }
 ccl_gpu_kernel_postfix

+ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
+                             int num_states,
+                             int partition_size,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             int kernel_index)
+{
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  int max_shaders = context.launch_params_metal.data.max_shaders;
+  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
+                                           kernel_integrator_state.path.queued_kernel;
+  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
+                                           kernel_integrator_state.path.shader_sort_key;
+  ccl_global int *key_offsets = (ccl_global int *)
+                                    kernel_integrator_state.sort_partition_key_offsets;
+  /* Bucket pass of the local-atomic sort; `num_states_limit` and `indices` are not used here. */
+  gpu_parallel_sort_bucket_pass(num_states,
+                                partition_size,
+                                max_shaders,
+                                kernel_index,
+                                d_queued_kernel,
+                                d_shader_sort_key,
+                                key_offsets,
+                                (threadgroup int *)threadgroup_array,
+                                metal_local_id,
+                                metal_local_size,
+                                metal_grid_id);
+#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
+}
+ccl_gpu_kernel_postfix
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sort_write_pass,
+                             int num_states,
+                             int partition_size,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             int kernel_index)
+{
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  int max_shaders = context.launch_params_metal.data.max_shaders;
+  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
+                                           kernel_integrator_state.path.queued_kernel;
+  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
+                                           kernel_integrator_state.path.shader_sort_key;
+  ccl_global int *key_offsets = (ccl_global int *)
+                                    kernel_integrator_state.sort_partition_key_offsets;
+  /* Write pass of the local-atomic sort: fills `indices`, bounded by `num_states_limit`. */
+  gpu_parallel_sort_write_pass(num_states,
+                               partition_size,
+                               max_shaders,
+                               kernel_index,
+                               num_states_limit,
+                               indices,
+                               d_queued_kernel,
+                               d_shader_sort_key,
+                               key_offsets,
+                               (threadgroup int *)threadgroup_array,
+                               metal_local_id,
+                               metal_local_size,
+                               metal_grid_id);
+#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
+}
+ccl_gpu_kernel_postfix
+
 ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_paths_array,
                             int num_states,
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -178,7 +178,7 @@ __device__
                                         simd_lane_index, \
                                         simd_group_index, \
                                         num_simd_groups, \
-                                         simdgroup_offset)
+                                         (threadgroup int *)threadgroup_array)
 #elif defined(__KERNEL_ONEAPI__)

 #  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
+
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+
+#  define atomic_store_local(p, x) \
+    atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
+#  define atomic_load_local(p) \
+    atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
+/* First sorting pass: count queued states per shader key and store per-partition key offsets. */
+ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
+                                                     const uint partition_size,
+                                                     const uint max_shaders,
+                                                     const uint queued_kernel,
+                                                     ccl_global ushort *d_queued_kernel,
+                                                     ccl_global uint *d_shader_sort_key,
+                                                     ccl_global int *partition_key_offsets,
+                                                     ccl_gpu_shared int *buckets,
+                                                     const ushort local_id,
+                                                     const ushort local_size,
+                                                     const ushort grid_id)
+{
+  /* Zero the bucket sizes. NOTE(review): assumes local_size >= max_shaders; confirm at launch. */
+  if (local_id < max_shaders) {
+    atomic_store_local(&buckets[local_id], 0);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Determine bucket sizes within this partition: threadgroup `grid_id` visits the states in
+   * [partition_start, partition_end), each thread striding by `local_size`. */
+  const uint partition_start = partition_size * uint(grid_id);
+  const uint partition_end = min(num_states, partition_start + partition_size);
+
+  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
+       state_index += uint(local_size)) {
+    ushort kernel_index = d_queued_kernel[state_index];
+    if (kernel_index == queued_kernel) {
+      uint key = d_shader_sort_key[state_index] % max_shaders;
+      atomic_fetch_and_add_uint32(&buckets[key], 1);
+    }
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Calculate the partition's local offsets from the prefix sum of bucket sizes. Each partition
+   * owns a row of (max_shaders + 1) ints: one offset per key, then the total active count. */
+  if (local_id == 0) {
+    int offset = 0;
+    for (int i = 0; i < max_shaders; i++) {
+      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
+      offset = offset + atomic_load_local(&buckets[i]);
+    }
+
+    /* Store the number of active states in this partition. */
+    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
+  }
+}
+
+/* Second sorting pass: scatter the indices of states queued for `queued_kernel` into `indices`,
+ * ordered by partition and then by shader sort key. `partition_key_offsets` must hold the rows
+ * written by gpu_parallel_sort_bucket_pass() (per-key offsets followed by the active count);
+ * `local_offset` is threadgroup scratch. Writes at index >= `num_states_limit` are skipped. */
+ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
+                                                    const uint partition_size,
+                                                    const uint max_shaders,
+                                                    const uint queued_kernel,
+                                                    const int num_states_limit,
+                                                    ccl_global int *indices,
+                                                    ccl_global ushort *d_queued_kernel,
+                                                    ccl_global uint *d_shader_sort_key,
+                                                    ccl_global int *partition_key_offsets,
+                                                    ccl_gpu_shared int *local_offset,
+                                                    const ushort local_id,
+                                                    const ushort local_size,
+                                                    const ushort grid_id)
+{
+  /* Calculate each partition's global offset from the prefix sum of the active state counts per
+   * partition. */
+  if (local_id < max_shaders) {
+    int partition_offset = 0;
+    for (int i = 0; i < uint(grid_id); i++) {
+      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
+      partition_offset += partition_key_count;
+    }
+
+    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
+    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Write the sorted active indices. */
+  const uint partition_start = partition_size * uint(grid_id);
+  const uint partition_end = min(num_states, partition_start + partition_size);
+
+  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
+       state_index += uint(local_size)) {
+    ushort kernel_index = d_queued_kernel[state_index];
+    if (kernel_index == queued_kernel) {
+      uint key = d_shader_sort_key[state_index] % max_shaders;
+      int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
+      if (index < num_states_limit) {
+        indices[index] = state_index;
+      }
+    }
+  }
+}
+
+#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */

 template<typename GetKeyOp>
 __device__ void gpu_parallel_sorted_index_array(const uint state_index,
--- a/intern/cycles/kernel/device/metal/bvh.h
+++ b/intern/cycles/kernel/device/metal/bvh.h
@@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
    kernel_assert(!"Invalid ift_local");
    return false;
  }
-#  endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-
-  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
-  if (triangle_only) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid ift_local_prim");
+    return false;
  }
+#  endif

  MetalRTIntersectionLocalPayload payload;
  payload.self = ray->self;
@@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
  }
  payload.result = false;

-  typename metalrt_intersector_type::result_type intersection;
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);

 #  if defined(__METALRT_MOTION__)
+  metalrt_intersector_type metalrt_intersect;
+  typename metalrt_intersector_type::result_type intersection;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
+  if (triangle_only) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
  intersection = metalrt_intersect.intersect(
      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
 #  else
+
+  metalrt_blas_intersector_type metalrt_intersect;
+  typename metalrt_blas_intersector_type::result_type intersection;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
+  if (triangle_only) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  // if we know we are going to get max one hit, like for random-sss-walk we can
+  // optimize and accept the first hit
+  if (max_hits == 1) {
+    metalrt_intersect.accept_any_intersection(true);
+  }
+
+  int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
+  // transform the ray into object's local space
+  Transform itfm = kernel_data_fetch(objects, local_object).itfm;
+  r.origin = transform_point(&itfm, r.origin);
+  r.direction = transform_direction(&itfm, r.direction);
+
  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
+      r,
+      metal_ancillaries->blas_accel_structs[blas_index].blas,
+      metal_ancillaries->ift_local_prim,
+      payload);
 #  endif

  if (lcg_state) {
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -105,10 +105,11 @@ struct kernel_gpu_##name \
 { \
  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
  void run(thread MetalKernelContext& context, \
-           threadgroup int *simdgroup_offset, \
+           threadgroup atomic_int *threadgroup_array, \
           const uint metal_global_id, \
           const ushort metal_local_id, \
           const ushort metal_local_size, \
+           const ushort metal_grid_id, \
           uint simdgroup_size, \
           uint simd_lane_index, \
           uint simd_group_index, \
@@ -117,22 +118,24 @@ struct kernel_gpu_##name \
 kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
                                constant MetalAncillaries *_metal_ancillaries, \
-                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
                                const uint metal_global_id [[thread_position_in_grid]], \
                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
                                const ushort metal_local_size [[threads_per_threadgroup]], \
+                                const ushort metal_grid_id    [[threadgroup_position_in_grid]], \
                                uint simdgroup_size [[threads_per_simdgroup]], \
                                uint simd_lane_index [[thread_index_in_simdgroup]], \
                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
-  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+  params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
 } \
 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
-                  threadgroup int *simdgroup_offset, \
+                  threadgroup atomic_int *threadgroup_array, \
                  const uint metal_global_id, \
                  const ushort metal_local_id, \
                  const ushort metal_local_size, \
+                  const ushort metal_grid_id, \
                  uint simdgroup_size, \
                  uint simd_lane_index, \
                  uint simd_group_index, \
@@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,

 #  if defined(__METALRT_MOTION__)
 #    define METALRT_TAGS instancing, instance_motion, primitive_motion
+#    define METALRT_BLAS_TAGS , primitive_motion
 #  else
 #    define METALRT_TAGS instancing
+#    define METALRT_BLAS_TAGS
 #  endif /* __METALRT_MOTION__ */

 typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
 typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
 typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
+#  if defined(__METALRT_MOTION__)
+typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
+typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
+typedef metal::raytracing::intersector<triangle_data, primitive_motion>
+    metalrt_blas_intersector_type;
+#  else
+typedef acceleration_structure<> metalrt_blas_as_type;
+typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
+typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
+#  endif

 #endif /* __METALRT__ */

@@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
  texture3d<float, access::sample> tex;
 };

+#ifdef __METALRT__
+struct MetalRTBlasWrapper {
+  metalrt_blas_as_type blas;
+};
+#endif
+
 struct MetalAncillaries {
  device Texture2DParamsMetal *textures_2d;
  device Texture3DParamsMetal *textures_3d;
@@ -291,6 +312,9 @@ struct MetalAncillaries {
  metalrt_ift_type ift_default;
  metalrt_ift_type ift_shadow;
  metalrt_ift_type ift_local;
+  metalrt_blas_ift_type ift_local_prim;
+  constant MetalRTBlasWrapper *blas_accel_structs;
+  constant int *blas_userID_to_index_lookUp;
 #endif
 };

--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -34,7 +34,7 @@ class MetalKernelContext {
      kernel_assert(0);
      return 0;
    }
-    
+
 #ifdef __KERNEL_METAL_INTEL__
    template<typename TextureType, typename CoordsType>
    inline __attribute__((__always_inline__))
@@ -55,7 +55,7 @@ class MetalKernelContext {
      }
    }
 #endif
-    
+
    // texture2d
    template<>
    inline __attribute__((__always_inline__))
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #endif
 }

+[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri_prim(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+    uint primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
+{
+  /* There is no instance_id (user id) parameter here: this variant is used when subsurface
+   * traversal starts from a primitive acceleration structure instead of the root of the global
+   * one, so every hit is already on the right object and `payload.local_object` is passed on. */
+  return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+      launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
+}
 [[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
 __anyhit__cycles_metalrt_local_hit_tri(
    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
  return result;
 }

+[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
+__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
+{
+  /* Unused function: always returns a result with `accept` and `continue_search` set to false. */
+  BoundingBoxIntersectionResult result;
+  result.distance = ray_tmax;
+  result.accept = false;
+  result.continue_search = false;
+  return result;
+}
+
 template<uint intersection_type>
 bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
                            ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -195,7 +195,15 @@ using sycl::half;
 #define fmodf(x, y) sycl::fmod((x), (y))
 #define lgammaf(x) sycl::lgamma((x))

-#define cosf(x) sycl::native::cos(((float)(x)))
+/* `sycl::native::cos` precision is not sufficient and `-ffast-math` lets
+ * the current DPC++ compiler overload `sycl::cos` with it.
+ * We work around this issue by directly calling the SPIRV implementation which
+ * provides greater precision. */
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
+#  define cosf(x) __spirv_ocl_cos(((float)(x)))
+#else
+#  define cosf(x) sycl::cos(((float)(x)))
+#endif
 #define sinf(x) sycl::native::sin(((float)(x)))
 #define powf(x, y) sycl::native::powr(((float)(x)), ((float)(y)))
 #define tanf(x) sycl::native::tan(((float)(x)))
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
          break;
        }
+        case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
+          break;
+        }
        case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
--- a/intern/cycles/kernel/film/denoising_passes.h
+++ b/intern/cycles/kernel/film/denoising_passes.h
@@ -58,23 +58,7 @@ ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals
    normal += sc->N * sc->sample_weight;
    sum_weight += sc->sample_weight;

-    Spectrum closure_albedo = sc->weight;
-    /* Closures that include a Fresnel term typically have weights close to 1 even though their
-     * actual contribution is significantly lower.
-     * To account for this, we scale their weight by the average fresnel factor (the same is also
-     * done for the sample weight in the BSDF setup, so we don't need to scale that here). */
-    if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
-      ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)sc;
-      closure_albedo *= bsdf->extra->fresnel_color;
-    }
-    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
-      ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)sc;
-      closure_albedo *= bsdf->avg_value;
-    }
-    else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
-      closure_albedo *= bsdf_principled_hair_albedo(sc);
-    }
-    else if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
+    if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
      /* BSSRDF already accounts for weight, retro-reflection would double up. */
      ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)
          sc;
@@ -83,6 +67,7 @@ ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals
      }
    }

+    Spectrum closure_albedo = bsdf_albedo(sd, sc);
    if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
      diffuse_albedo += closure_albedo;
      sum_nonspecular_weight += sc->sample_weight;
--- a/intern/cycles/kernel/geom/curve_intersect.h
+++ b/intern/cycles/kernel/geom/curve_intersect.h
@@ -720,7 +720,7 @@ ccl_device_inline void curve_shader_setup(KernelGlobals kg,
    const float3 tangent = normalize(dPdu);
    const float3 bitangent = normalize(cross(tangent, -D));
    const float sine = sd->v;
-    const float cosine = safe_sqrtf(1.0f - sine * sine);
+    const float cosine = cos_from_sin(sine);

    sd->N = normalize(sine * bitangent - cosine * normalize(cross(tangent, bitangent)));
 #  if 0
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -704,9 +704,9 @@ ccl_device_forceinline bool mnee_compute_transfer_matrix(ccl_private const Shade
    float ilo = -eta * ilh;

    float cos_theta = dot(wo, m.n);
-    float sin_theta = safe_sqrtf(1.f - sqr(cos_theta));
+    float sin_theta = sin_from_cos(cos_theta);
    float cos_phi = dot(wo, s);
-    float sin_phi = safe_sqrtf(1.f - sqr(cos_phi));
+    float sin_phi = sin_from_cos(cos_phi);

    /* Wo = (cos_phi * sin_theta) * s + (sin_phi * sin_theta) * t + cos_theta * n. */
    float3 dH_dtheta = ilo * (cos_theta * (cos_phi * s + sin_phi * t) - sin_theta * m.n);
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
  /* Index of main path which will be used by a next shadow catcher split.  */
  ccl_global int *next_main_path_index;

+  /* Partition/key offsets used when writing sorted active indices. */
+  ccl_global int *sort_partition_key_offsets;
+
  /* Divisor used to partition active indices by locality when sorting by material.  */
  uint sort_partition_divisor;
 } IntegratorStateGPU;
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+
+#  if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
+    return;
+  }
+#  endif
+
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
 }

@@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+
+#  if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
+    return;
+  }
+#  endif
+
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
 }

--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -136,7 +136,7 @@ ccl_device_forceinline float diffusion_length_dwivedi(float alpha)

 ccl_device_forceinline float3 direction_from_cosine(float3 D, float cos_theta, float randv)
 {
-  float sin_theta = safe_sqrtf(1.0f - cos_theta * cos_theta);
+  float sin_theta = sin_from_cos(cos_theta);
  float phi = M_2PI_F * randv;
  float3 dir = make_float3(sin_theta * cosf(phi), sin_theta * sinf(phi), cos_theta);

--- a/intern/cycles/kernel/integrator/surface_shader.h
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -621,7 +621,7 @@ ccl_device Spectrum surface_shader_diffuse(KernelGlobals kg, ccl_private const S
    ccl_private const ShaderClosure *sc = &sd->closure[i];

    if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
-      eval += sc->weight;
+      eval += bsdf_albedo(sd, sc);
  }

  return eval;
@@ -635,7 +635,7 @@ ccl_device Spectrum surface_shader_glossy(KernelGlobals kg, ccl_private const Sh
    ccl_private const ShaderClosure *sc = &sd->closure[i];

    if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
-      eval += sc->weight;
+      eval += bsdf_albedo(sd, sc);
  }

  return eval;
@@ -649,7 +649,7 @@ ccl_device Spectrum surface_shader_transmission(KernelGlobals kg, ccl_private co
    ccl_private const ShaderClosure *sc = &sd->closure[i];

    if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
-      eval += sc->weight;
+      eval += bsdf_albedo(sd, sc);
  }

  return eval;
--- a/intern/cycles/kernel/light/area.h
+++ b/intern/cycles/kernel/light/area.h
@@ -102,7 +102,7 @@ ccl_device float area_light_spread_attenuation(const float3 D,
    /* The factor M_PI_F comes from integrating the radiance over the hemisphere */
    return (cos_a > 0.9999997f) ? M_PI_F : 0.0f;
  }
-  const float sin_a = safe_sqrtf(1.0f - sqr(cos_a));
+  const float sin_a = sin_from_cos(cos_a);
  const float tan_a = sin_a / cos_a;
  return max((tan_half_spread - tan_a) * normalize_spread, 0.0f);
 }
--- a/intern/cycles/kernel/light/spot.h
+++ b/intern/cycles/kernel/light/spot.h
@@ -7,24 +7,13 @@

 CCL_NAMESPACE_BEGIN

-ccl_device float spot_light_attenuation(float3 dir,
-                                        float cos_half_spot_angle,
-                                        float spot_smooth,
-                                        float3 N)
+ccl_device float spot_light_attenuation(const ccl_global KernelSpotLight *spot, float3 ray)
 {
-  float attenuation = dot(dir, N);
+  const float3 scaled_ray = safe_normalize(
+      make_float3(dot(ray, spot->axis_u), dot(ray, spot->axis_v), dot(ray, spot->dir)) /
+      spot->len);

-  if (attenuation <= cos_half_spot_angle) {
-    attenuation = 0.0f;
-  }
-  else {
-    float t = attenuation - cos_half_spot_angle;
-
-    if (t < spot_smooth && spot_smooth != 0.0f)
-      attenuation *= smoothstepf(t / spot_smooth);
-  }
-
-  return attenuation;
+  return smoothstepf((scaled_ray.z - spot->cos_half_spot_angle) / spot->spot_smooth);
 }

 template<bool in_volume_segment>
@@ -57,8 +46,7 @@ ccl_device_inline bool spot_light_sample(const ccl_global KernelLight *klight,
  ls->eval_fac = (0.25f * M_1_PI_F) * invarea;

  /* spot light attenuation */
-  ls->eval_fac *= spot_light_attenuation(
-      klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
+  ls->eval_fac *= spot_light_attenuation(&klight->spot, -ls->D);
  if (!in_volume_segment && ls->eval_fac == 0.0f) {
    return false;
  }
@@ -87,8 +75,7 @@ ccl_device_forceinline void spot_light_update_position(const ccl_global KernelLi
  ls->pdf = invarea;

  /* spot light attenuation */
-  ls->eval_fac *= spot_light_attenuation(
-      klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, ls->Ng);
+  ls->eval_fac *= spot_light_attenuation(&klight->spot, ls->Ng);
 }

 ccl_device_inline bool spot_light_intersect(const ccl_global KernelLight *klight,
@@ -129,8 +116,7 @@ ccl_device_inline bool spot_light_sample_from_intersection(
  ls->pdf = invarea;

  /* spot light attenuation */
-  ls->eval_fac *= spot_light_attenuation(
-      klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
+  ls->eval_fac *= spot_light_attenuation(&klight->spot, -ls->D);

  if (ls->eval_fac == 0.0f) {
    return false;
--- a/intern/cycles/kernel/light/tree.h
+++ b/intern/cycles/kernel/light/tree.h
@@ -47,11 +47,6 @@ ccl_device float light_tree_cos_bounding_box_angle(const BoundingBox bbox,
  return cos_theta_u;
 }

-ccl_device_forceinline float sin_from_cos(const float c)
-{
-  return safe_sqrtf(1.0f - sqr(c));
-}
-
 /* Compute vector v as in Fig .8. P_v is the corresponding point along the ray. */
 ccl_device float3 compute_v(
    const float3 centroid, const float3 P, const float3 D, const float3 bcone_axis, const float t)
--- a/intern/cycles/kernel/light/triangle.h
+++ b/intern/cycles/kernel/light/triangle.h
@@ -218,7 +218,7 @@ ccl_device_forceinline bool triangle_light_sample(KernelGlobals kg,
    /* Finally, select a random point along the edge of the new triangle
     * That point on the spherical triangle is the sampled ray direction */
    const float z = 1.0f - randv * (1.0f - dot(C_, B));
-    ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);
+    ls->D = z * B + sin_from_cos(z) * safe_normalize(C_ - dot(C_, B) * B);

    /* calculate intersection with the planar triangle */
    if (!ray_triangle_intersect(
--- a/intern/cycles/kernel/osl/closures_setup.h
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
  if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
      closure->distribution == make_string("default", 4430693559278735917ull)) {
    if (!closure->refract) {
-      if (closure->alpha_x == closure->alpha_y) {
-        /* Isotropic */
-        sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
-      }
-      else {
-        /* Anisotropic */
-        sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
-      }
+      sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
    }
    else {
      sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
@@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
  /* Beckmann */
  else {
    if (!closure->refract) {
-      if (closure->alpha_x == closure->alpha_y) {
-        /* Isotropic */
-        sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
-      }
-      else {
-        /* Anisotropic */
-        sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
-      }
+      sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
    }
    else {
      sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
@@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
  }

  bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
-  bsdf->alpha_x = closure->alpha_x;
+  bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;

-  sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
+  sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 }

 ccl_device void osl_closure_microfacet_ggx_aniso_setup(
@@ -345,7 +331,6 @@ ccl_device void osl_closure_microfacet_ggx_fresnel_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = zero_float3();

@@ -383,7 +368,6 @@ ccl_device void osl_closure_microfacet_ggx_aniso_fresnel_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = closure->T;

@@ -426,7 +410,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = zero_spectrum();
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = zero_float3();

@@ -467,7 +450,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_glass_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = zero_spectrum();
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = zero_float3();

@@ -508,7 +490,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_aniso_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = zero_spectrum();
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = closure->T;

@@ -551,7 +532,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_fresnel_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = zero_float3();

@@ -592,7 +572,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_glass_fresnel_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = zero_float3();

@@ -633,7 +612,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_aniso_fresnel_setup(
  bsdf->extra = extra;
  bsdf->extra->color = rgb_to_spectrum(closure->color);
  bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
-  bsdf->extra->clearcoat = 0.0f;

  bsdf->T = closure->T;

@@ -660,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
  }

  bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
-  bsdf->alpha_x = closure->alpha_x;
+  bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;

-  sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
+  sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 }

 ccl_device void osl_closure_microfacet_beckmann_aniso_setup(
@@ -865,27 +843,18 @@ ccl_device void osl_closure_principled_clearcoat_setup(
    float3 weight,
    ccl_private const PrincipledClearcoatClosure *closure)
 {
+  weight *= 0.25f * closure->clearcoat;
  ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
      sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
  if (!bsdf) {
    return;
  }

-  MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
-  if (!extra) {
-    return;
-  }
-
  bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
  bsdf->alpha_x = closure->clearcoat_roughness;
  bsdf->alpha_y = closure->clearcoat_roughness;
  bsdf->ior = 1.5f;

-  bsdf->extra = extra;
-  bsdf->extra->color = zero_spectrum();
-  bsdf->extra->cspec0 = make_spectrum(0.04f);
-  bsdf->extra->clearcoat = closure->clearcoat;
-
  bsdf->T = zero_float3();

  sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -161,7 +161,10 @@ ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
                        /* shadeindex = */ 0);
 #  endif

-  if (globals.Ci) {
+  if constexpr (type == SHADER_TYPE_DISPLACEMENT) {
+    sd->P = globals.P;
+  }
+  else if (globals.Ci) {
    flatten_closure_tree(kg, sd, path_flag, globals.Ci);
  }
 }
--- a/intern/cycles/kernel/sample/mapping.h
+++ b/intern/cycles/kernel/sample/mapping.h
@@ -67,17 +67,18 @@ ccl_device_inline void sample_uniform_cone(const float3 N,
                                           ccl_private float3 *wo,
                                           ccl_private float *pdf)
 {
-  float zMin = cosf(angle);
-  float z = zMin - zMin * randu + randu;
-  float r = safe_sqrtf(1.0f - sqr(z));
-  float phi = M_2PI_F * randv;
-  float x = r * cosf(phi);
-  float y = r * sinf(phi);
+  const float cosThetaMin = cosf(angle);
+  const float cosTheta = mix(cosThetaMin, 1.0f, randu);
+  const float sinTheta = sin_from_cos(cosTheta);
+  const float phi = M_2PI_F * randv;
+  const float x = sinTheta * cosf(phi);
+  const float y = sinTheta * sinf(phi);
+  const float z = cosTheta;

  float3 T, B;
  make_orthonormals(N, &T, &B);
  *wo = x * T + y * B + z * N;
-  *pdf = M_1_2PI_F / (1.0f - zMin);
+  *pdf = M_1_2PI_F / (1.0f - cosThetaMin);
 }

 ccl_device_inline float pdf_uniform_cone(const float3 N, float3 D, float angle)
--- a/intern/cycles/kernel/svm/brick.h
+++ b/intern/cycles/kernel/svm/brick.h
@@ -46,17 +46,8 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
  float tint = saturatef((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias));
  float min_dist = min(min(x, y), min(brick_width - x, row_height - y));

-  float mortar;
-  if (min_dist >= mortar_size) {
-    mortar = 0.0f;
-  }
-  else if (mortar_smooth == 0.0f) {
-    mortar = 1.0f;
-  }
-  else {
-    min_dist = 1.0f - min_dist / mortar_size;
-    mortar = (min_dist < mortar_smooth) ? smoothstepf(min_dist / mortar_smooth) : 1.0f;
-  }
+  min_dist = 1.0f - min_dist / mortar_size;
+  float mortar = smoothstepf(min_dist / mortar_smooth);

  return make_float2(tint, mortar);
 }
--- a/intern/cycles/kernel/svm/closure.h
+++ b/intern/cycles/kernel/svm/closure.h
@@ -333,7 +333,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
            bsdf->extra->cspec0 = rgb_to_spectrum(
                (specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic);
            bsdf->extra->color = rgb_to_spectrum(base_color);
-            bsdf->extra->clearcoat = 0.0f;

            /* setup bsdf */
            if (distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID ||
@@ -383,7 +382,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,

                bsdf->extra->color = rgb_to_spectrum(base_color);
                bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
-                bsdf->extra->clearcoat = 0.0f;

                /* setup bsdf */
                sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
@@ -440,7 +438,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,

              bsdf->extra->color = rgb_to_spectrum(base_color);
              bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
-              bsdf->extra->clearcoat = 0.0f;

              /* setup bsdf */
              sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
@@ -455,30 +452,20 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
 #ifdef __CAUSTICS_TRICKS__
      if (kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
 #endif
-        if (clearcoat > CLOSURE_WEIGHT_CUTOFF) {
-          ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
-              sd, sizeof(MicrofacetBsdf), weight);
-          ccl_private MicrofacetExtra *extra =
-              (bsdf != NULL) ?
-                  (ccl_private MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) :
-                  NULL;
+        Spectrum clearcoat_weight = 0.25f * clearcoat * weight;
+        ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
+            sd, sizeof(MicrofacetBsdf), clearcoat_weight);

-          if (bsdf && extra) {
-            bsdf->N = clearcoat_normal;
-            bsdf->T = zero_float3();
-            bsdf->ior = 1.5f;
-            bsdf->extra = extra;
+        if (bsdf) {
+          bsdf->N = clearcoat_normal;
+          bsdf->T = zero_float3();
+          bsdf->ior = 1.5f;

-            bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
-            bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
+          bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
+          bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;

-            bsdf->extra->color = zero_spectrum();
-            bsdf->extra->cspec0 = make_spectrum(0.04f);
-            bsdf->extra->clearcoat = clearcoat;
-
-            /* setup bsdf */
-            sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
-          }
+          /* setup bsdf */
+          sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
        }
 #ifdef __CAUSTICS_TRICKS__
      }
@@ -584,7 +571,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
        if (bsdf->extra) {
          bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.w));
          bsdf->extra->cspec0 = zero_spectrum();
-          bsdf->extra->clearcoat = 0.0f;
          sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
        }
      }
@@ -724,7 +710,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
      kernel_assert(stack_valid(data_node.z));
      bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.z));
      bsdf->extra->cspec0 = zero_spectrum();
-      bsdf->extra->clearcoat = 0.0f;

      /* setup bsdf */
      sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);
--- a/intern/cycles/kernel/svm/types.h
+++ b/intern/cycles/kernel/svm/types.h
@@ -489,8 +489,7 @@ typedef enum ClosureType {
 #define CLOSURE_IS_BSDF_MICROFACET_FRESNEL(type) \
  (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID || \
   type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
-   type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
-   type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
+   type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID)
 #define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
 #define CLOSURE_IS_BSSRDF(type) \
  (type >= CLOSURE_BSSRDF_BURLEY_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME__

 /* TODO: solve internal compiler errors and enable light tree on HIP. */
-#ifdef __KERNEL_HIP__
+/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
+#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
 #  undef __LIGHT_TREE__
 #endif

@@ -1290,12 +1291,14 @@ typedef struct KernelCurveSegment {
 static_assert_align(KernelCurveSegment, 8);

 typedef struct KernelSpotLight {
+  packed_float3 axis_u;
  float radius;
+  packed_float3 axis_v;
  float invarea;
-  float cos_half_spot_angle;
-  float spot_smooth;
  packed_float3 dir;
-  float pad;
+  float cos_half_spot_angle;
+  packed_float3 len;
+  float spot_smooth;
 } KernelSpotLight;

 /* PointLight is SpotLight with only radius and invarea being used. */
@@ -1506,6 +1509,8 @@ typedef enum DeviceKernel : int {
  DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
  DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
  DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
+  DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
  DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
  DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
  DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -13,6 +13,7 @@
 #include "scene/light.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/osl.h"
 #include "scene/pointcloud.h"
 #include "scene/scene.h"
 #include "scene/shader.h"
@@ -23,7 +24,9 @@
 #include "subd/patch_table.h"
 #include "subd/split.h"

-#include "kernel/osl/globals.h"
+#ifdef WITH_OSL
+#  include "kernel/osl/globals.h"
+#endif

 #include "util/foreach.h"
 #include "util/log.h"
@@ -306,6 +309,11 @@ void GeometryManager::update_osl_globals(Device *device, Scene *scene)
 {
 #ifdef WITH_OSL
  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
+  if (og == nullptr) {
+    /* Can happen when rendering with multiple GPUs, but no CPU (in which case the name maps filled
+     * below are not used anyway) */
+    return;
+  }

  og->object_name_map.clear();
  og->object_names.clear();
@@ -1666,6 +1674,7 @@ void GeometryManager::device_update_displacement_images(Device *device,
  TaskPool pool;
  ImageManager *image_manager = scene->image_manager;
  set<int> bump_images;
+  bool has_osl_node = false;
  foreach (Geometry *geom, scene->geometry) {
    if (geom->is_modified()) {
      /* Geometry-level check for hair shadow transparency.
@@ -1685,6 +1694,9 @@ void GeometryManager::device_update_displacement_images(Device *device,
          continue;
        }
        foreach (ShaderNode *node, shader->graph->nodes) {
+          if (node->special_type == SHADER_SPECIAL_TYPE_OSL) {
+            has_osl_node = true;
+          }
          if (node->special_type != SHADER_SPECIAL_TYPE_IMAGE_SLOT) {
            continue;
          }
@@ -1700,6 +1712,15 @@ void GeometryManager::device_update_displacement_images(Device *device,
      }
    }
  }
+
+#ifdef WITH_OSL
+  /* If any OSL node is used for displacement, it may reference a texture. But it's
+   * unknown which ones, so have to load them all. */
+  if (has_osl_node) {
+    OSLShaderManager::osl_image_slots(device, image_manager, bump_images);
+  }
+#endif
+
  foreach (int slot, bump_images) {
    pool.push(function_bind(
        &ImageManager::device_update_slot, image_manager, device, scene, slot, &progress));
--- a/Show More
+++ b/Show More