Compare commits

570 commits: tmp-eevee- ... universal-
Author | SHA1 | Date
---|---|---
(Commit table: 570 rows of abbreviated SHA1 hashes; the author, date, and message cells were empty.)
@@ -1,8 +0,0 @@
-{
-  "project_id" : "Blender",
-  "conduit_uri" : "https://developer.blender.org/",
-  "phabricator.uri" : "https://developer.blender.org/",
-  "git.default-relative-commit" : "origin/master",
-  "arc.land.update.default" : "rebase",
-  "arc.land.onto.default" : "master"
-}
@@ -236,6 +236,8 @@ ForEachMacros:
   - LOOP_UNSELECTED_POINTS
   - LOOP_VISIBLE_KEYS
   - LOOP_VISIBLE_POINTS
+  - LIGHT_FOREACH_BEGIN_DIRECTIONAL
+  - LIGHT_FOREACH_BEGIN_LOCAL
   - LISTBASE_CIRCULAR_BACKWARD_BEGIN
   - LISTBASE_CIRCULAR_FORWARD_BEGIN
   - LISTBASE_FOREACH
@@ -1,9 +1,9 @@
 name: Bug Report
 about: File a bug report
 labels:
-  - "type::Report"
-  - "status::Needs Triage"
-  - "priority::Normal"
+  - "Type/Report"
+  - "Status/Needs Triage"
+  - "Priority/Normal"
 body:
   - type: markdown
     attributes:
@@ -1,7 +1,7 @@
 name: Design
 about: Create a design task (for developers only)
 labels:
-  - "type::Design"
+  - "Type/Design"
 body:
   - type: textarea
     id: body
@@ -1,7 +1,7 @@
 name: To Do
 about: Create a to do task (for developers only)
 labels:
-  - "type::To Do"
+  - "Type/To Do"
 body:
   - type: textarea
     id: body
.github/pull_request_template.md (3 changed lines, vendored)

@@ -1,5 +1,4 @@
-This repository is only used as a mirror of git.blender.org. Blender development happens on
-https://developer.blender.org.
+This repository is only used as a mirror. Blender development happens on projects.blender.org.
 
 To get started with contributing code, please see:
 https://wiki.blender.org/wiki/Process/Contributing_Code
.github/stale.yml (3 changed lines, vendored)

@@ -15,8 +15,7 @@ staleLabel: stale
 # Comment to post when closing a stale Issue or Pull Request.
 closeComment: >
   This issue has been automatically closed, because this repository is only
-  used as a mirror of git.blender.org. Blender development happens on
-  developer.blender.org.
+  used as a mirror. Blender development happens on projects.blender.org.
 
   To get started contributing code, please read:
   https://wiki.blender.org/wiki/Process/Contributing_Code
.gitignore (20 changed lines, vendored)

@@ -39,7 +39,7 @@ Desktop.ini
 /doc/python_api/rst/bmesh.ops.rst
 
 # in-source lib downloads
-/build_files/build_environment/downloads
+/build_files/build_environment/downloads/
 
 # in-source buildbot signing configuration
 /build_files/buildbot/codesign/config_server.py
@@ -48,4 +48,20 @@ Desktop.ini
 waveletNoiseTile.bin
 
 # testing environment
-/Testing
+/Testing/
+
+# Translations.
+/locale/user-config.py
+
+# External repositories.
+/scripts/addons/
+/scripts/addons_contrib/
+
+# Ignore old submodules directories.
+# Eventually need to get rid of those, but for the first time of transition
+# avoid indidents when the folders exists after bisect and developers staging
+# them by accident.
+/release/scripts/addons/
+/release/datafiles/locale/
+/release/scripts/addons_contrib/
+/source/tools/
.gitmodules (20 changed lines, vendored)

@@ -1,20 +0,0 @@
-[submodule "release/scripts/addons"]
-  path = release/scripts/addons
-  url = ../blender-addons.git
-  branch = master
-  ignore = all
-[submodule "release/scripts/addons_contrib"]
-  path = release/scripts/addons_contrib
-  url = ../blender-addons-contrib.git
-  branch = master
-  ignore = all
-[submodule "release/datafiles/locale"]
-  path = release/datafiles/locale
-  url = ../blender-translations.git
-  branch = master
-  ignore = all
-[submodule "source/tools"]
-  path = source/tools
-  url = ../blender-dev-tools.git
-  branch = master
-  ignore = all
@@ -524,7 +524,7 @@ endif()
 if(NOT APPLE)
   option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" ON)
   option(WITH_CYCLES_HIP_BINARIES "Build Cycles AMD HIP binaries" OFF)
-  set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
+  set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx906 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
   mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
   mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
 endif()
@@ -625,8 +625,10 @@ mark_as_advanced(
 
 # Vulkan
 option(WITH_VULKAN_BACKEND "Enable Vulkan as graphics backend (only for development)" OFF)
+option(WITH_VULKAN_GUARDEDALLOC "Use guardedalloc for host allocations done inside Vulkan (development option)" OFF)
 mark_as_advanced(
   WITH_VULKAN_BACKEND
+  WITH_VULKAN_GUARDEDALLOC
 )
 
 # Metal
@@ -952,21 +954,6 @@ endif()
 # -----------------------------------------------------------------------------
 # Check if Sub-modules are Cloned
 
-if(WITH_INTERNATIONAL)
-  file(GLOB RESULT "${CMAKE_SOURCE_DIR}/release/datafiles/locale")
-  list(LENGTH RESULT DIR_LEN)
-  if(DIR_LEN EQUAL 0)
-    message(
-      WARNING
-      "Translation path '${CMAKE_SOURCE_DIR}/release/datafiles/locale' is missing, "
-      "This is a 'git submodule', which are known not to work with bridges to other version "
-      "control systems."
-    )
-    set(TRANSLATIONS_FOUND OFF)
-    set_and_warn_library_found("Translations" TRANSLATIONS_FOUND WITH_INTERNATIONAL)
-  endif()
-endif()
-
 if(WITH_PYTHON)
   # While we have this as an '#error' in 'bpy_capi_utils.h',
   # upgrading Python tends to cause confusion for users who build.
@@ -982,14 +969,14 @@ if(WITH_PYTHON)
   )
   endif()
 
-  file(GLOB RESULT "${CMAKE_SOURCE_DIR}/release/scripts/addons")
+  file(GLOB RESULT "${CMAKE_SOURCE_DIR}/scripts/addons")
   list(LENGTH RESULT DIR_LEN)
   if(DIR_LEN EQUAL 0)
     message(
       WARNING
-      "Addons path '${CMAKE_SOURCE_DIR}/release/scripts/addons' is missing, "
-      "This is a 'git submodule', which are known not to work with bridges to other version "
-      "control systems: * CONTINUING WITHOUT ADDONS *"
+      "Addons path '${CMAKE_SOURCE_DIR}/scripts/addons' is missing. "
+      "This is an external repository which needs to be checked out. Use `make update` to do so. "
+      "* CONTINUING WITHOUT ADDONS *"
     )
   endif()
 endif()
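The warning's replacement workflow: the add-ons tree is no longer a git submodule and is fetched by the update script instead. A sketch of the checkout step (`make update` drives `build_files/utils/make_update.py`, which also appears in the GNUmakefile changes below):

```sh
# Fetch the external scripts/addons repositories after the submodule removal.
cd ~/blender-git/blender
make update
```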
GNUmakefile (36 changed lines)

@@ -69,7 +69,7 @@ Static Source Code Checking
 * check_cmake: Runs our own cmake file checker which detects errors in the cmake file list definitions.
 * check_pep8: Checks all Python script are pep8 which are tagged to use the stricter formatting.
 * check_mypy: Checks all Python scripts using mypy,
-  see: source/tools/check_source/check_mypy_config.py scripts which are included.
+  see: tools/check_source/check_mypy_config.py scripts which are included.
 
 Documentation Checking
 
@@ -85,7 +85,7 @@ Spell Checkers
 * check_spelling_osl: Check for spelling errors (OSL only).
 * check_spelling_py: Check for spelling errors (Python only).
 
-Note: an additional word-list is maintained at: 'source/tools/check_source/check_spelling_c_config.py'
+Note: an additional word-list is maintained at: 'tools/check_source/check_spelling_c_config.py'
 
 Note: that spell checkers can take a 'CHECK_SPELLING_CACHE' filepath argument,
 so re-running does not need to re-check unchanged files.
@@ -299,7 +299,11 @@ else
 ifneq ("$(wildcard $(DEPS_BUILD_DIR)/build.ninja)","")
 DEPS_BUILD_COMMAND:=ninja
 else
-DEPS_BUILD_COMMAND:=make -s
+ifeq ($(OS), Darwin)
+DEPS_BUILD_COMMAND:=make -s
+else
+DEPS_BUILD_COMMAND:="$(BLENDER_DIR)/build_files/build_environment/linux/make_deps_wrapper.sh" -s
+endif
 endif
 endif
 
@@ -398,7 +402,7 @@ endif
 
 deps: .FORCE
   @echo
-  @echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\"
+  @echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\", install to \"$(DEPS_INSTALL_DIR)\"
 
   @cmake -H"$(DEPS_SOURCE_DIR)" \
   -B"$(DEPS_BUILD_DIR)" \
@@ -486,22 +490,22 @@ check_smatch: .FORCE
   $(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_smatch.py"
 
 check_mypy: .FORCE
-  @$(PYTHON) "$(BLENDER_DIR)/source/tools/check_source/check_mypy.py"
+  @$(PYTHON) "$(BLENDER_DIR)/tools/check_source/check_mypy.py"
 
 check_wiki_file_structure: .FORCE
   @PYTHONIOENCODING=utf_8 $(PYTHON) \
-  "$(BLENDER_DIR)/source/tools/check_wiki/check_wiki_file_structure.py"
+  "$(BLENDER_DIR)/tools/check_wiki/check_wiki_file_structure.py"
 
 check_spelling_py: .FORCE
   @cd "$(BUILD_DIR)" ; \
   PYTHONIOENCODING=utf_8 $(PYTHON) \
-  "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
-  "$(BLENDER_DIR)/release/scripts"
+  "$(BLENDER_DIR)/tools/check_source/check_spelling.py" \
+  "$(BLENDER_DIR)/scripts"
 
 check_spelling_c: .FORCE
   @cd "$(BUILD_DIR)" ; \
   PYTHONIOENCODING=utf_8 $(PYTHON) \
-  "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
+  "$(BLENDER_DIR)/tools/check_source/check_spelling.py" \
   --cache-file=$(CHECK_SPELLING_CACHE) \
   "$(BLENDER_DIR)/source" \
   "$(BLENDER_DIR)/intern/cycles" \
@@ -511,21 +515,21 @@ check_spelling_c: .FORCE
 check_spelling_osl: .FORCE
   @cd "$(BUILD_DIR)" ; \
   PYTHONIOENCODING=utf_8 $(PYTHON) \
-  "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
+  "$(BLENDER_DIR)/tools/check_source/check_spelling.py" \
   --cache-file=$(CHECK_SPELLING_CACHE) \
   "$(BLENDER_DIR)/intern/cycles/kernel/shaders"
 
 check_descriptions: .FORCE
   @$(BLENDER_BIN) --background -noaudio --factory-startup --python \
-  "$(BLENDER_DIR)/source/tools/check_source/check_descriptions.py"
+  "$(BLENDER_DIR)/tools/check_source/check_descriptions.py"
 
 check_deprecated: .FORCE
   @PYTHONIOENCODING=utf_8 $(PYTHON) \
-  source/tools/check_source/check_deprecated.py
+  tools/check_source/check_deprecated.py
 
 check_licenses: .FORCE
   @PYTHONIOENCODING=utf_8 $(PYTHON) \
-  "$(BLENDER_DIR)/source/tools/check_source/check_licenses.py" \
+  "$(BLENDER_DIR)/tools/check_source/check_licenses.py" \
   "--show-headers=$(SHOW_HEADERS)"
 
 check_pep8: .FORCE
@@ -534,7 +538,7 @@ check_pep8: .FORCE
 
 check_cmake: .FORCE
   @PYTHONIOENCODING=utf_8 $(PYTHON) \
-  source/tools/check_source/check_cmake_consistency.py
+  tools/check_source/check_cmake_consistency.py
 
 
 # -----------------------------------------------------------------------------
@@ -572,8 +576,8 @@ update_code: .FORCE
   @$(PYTHON) ./build_files/utils/make_update.py --no-libraries
 
 format: .FORCE
-  @PATH="${LIBDIR}/llvm/bin/:$(PATH)" $(PYTHON) source/tools/utils_maintenance/clang_format_paths.py $(PATHS)
-  @$(PYTHON) source/tools/utils_maintenance/autopep8_format_paths.py --autopep8-command="$(AUTOPEP8)" $(PATHS)
+  @PATH="${LIBDIR}/llvm/bin/:$(PATH)" $(PYTHON) tools/utils_maintenance/clang_format_paths.py $(PATHS)
+  @$(PYTHON) tools/utils_maintenance/autopep8_format_paths.py --autopep8-command="$(AUTOPEP8)" $(PATHS)
 
 
 # -----------------------------------------------------------------------------
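None of these path updates change how the checkers are run; the targets are invoked as before from the source root, now resolving against the relocated `tools/` tree. Illustrative invocations (a sketch, not output from this change set):

```sh
# Run the relocated checkers from the Blender source root.
make check_cmake
make check_mypy
# The format target also accepts an optional PATHS variable.
make format PATHS="source/blender/blenkernel"
```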
@@ -24,7 +24,7 @@ Development
 -----------
 
 - [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
-- [Code Review & Bug Tracker](https://developer.blender.org)
+- [Code Review & Bug Tracker](https://projects.blender.org)
 - [Developer Forum](https://devtalk.blender.org)
 - [Developer Documentation](https://wiki.blender.org)
 
@@ -10,7 +10,7 @@ ExternalProject_Add(external_epoxy
   URL_HASH ${EPOXY_HASH_TYPE}=${EPOXY_HASH}
   PREFIX ${BUILD_DIR}/epoxy
   PATCH_COMMAND ${PATCH_CMD} -p 1 -N -d ${BUILD_DIR}/epoxy/src/external_epoxy/ < ${PATCH_DIR}/epoxy.diff
-  CONFIGURE_COMMAND ${CONFIGURE_ENV} && ${MESON} setup --prefix ${LIBDIR}/epoxy --default-library ${EPOXY_LIB_TYPE} --libdir lib ${BUILD_DIR}/epoxy/src/external_epoxy-build ${BUILD_DIR}/epoxy/src/external_epoxy -Dtests=false
+  CONFIGURE_COMMAND ${CONFIGURE_ENV} && ${MESON} setup --prefix ${LIBDIR}/epoxy --default-library ${EPOXY_LIB_TYPE} --libdir lib ${BUILD_DIR}/epoxy/src/external_epoxy-build ${BUILD_DIR}/epoxy/src/external_epoxy -Dtests=false ${MESON_BUILD_TYPE}
   BUILD_COMMAND ninja
   INSTALL_COMMAND ninja install
 )
@@ -9,7 +9,7 @@ ExternalProject_Add(external_fribidi
   URL_HASH ${FRIBIDI_HASH_TYPE}=${FRIBIDI_HASH}
   DOWNLOAD_DIR ${DOWNLOAD_DIR}
   PREFIX ${BUILD_DIR}/fribidi
-  CONFIGURE_COMMAND ${MESON} setup --prefix ${LIBDIR}/fribidi -Ddocs=false --default-library static --libdir lib ${BUILD_DIR}/fribidi/src/external_fribidi-build ${BUILD_DIR}/fribidi/src/external_fribidi
+  CONFIGURE_COMMAND ${MESON} setup --prefix ${LIBDIR}/fribidi ${MESON_BUILD_TYPE} -Ddocs=false --default-library static --libdir lib ${BUILD_DIR}/fribidi/src/external_fribidi-build ${BUILD_DIR}/fribidi/src/external_fribidi
   BUILD_COMMAND ninja
   INSTALL_COMMAND ninja install
   INSTALL_DIR ${LIBDIR}/fribidi
@@ -22,7 +22,7 @@ elseif(UNIX AND NOT APPLE)
   )
 endif()
 
-# Boolean crashes with Arm assembly, see T103423.
+# Boolean crashes with Arm assembly, see #103423.
 if(BLENDER_PLATFORM_ARM)
   set(GMP_OPTIONS
     ${GMP_OPTIONS}
@@ -21,6 +21,7 @@ set(HARFBUZZ_EXTRA_OPTIONS
   # Only used for command line utilities,
   # disable as this would add an addition & unnecessary build-dependency.
   -Dcairo=disabled
+  ${MESON_BUILD_TYPE}
 )
 
 ExternalProject_Add(external_harfbuzz
@@ -59,3 +60,10 @@ if(BUILD_MODE STREQUAL Release AND WIN32)
     DEPENDEES install
   )
 endif()
+
+if(BUILD_MODE STREQUAL Debug AND WIN32)
+  ExternalProject_Add_Step(external_harfbuzz after_install
+    COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/harfbuzz/lib/libharfbuzz.a ${HARVEST_TARGET}/harfbuzz/lib/libharfbuzz_d.lib
+    DEPENDEES install
+  )
+endif()
@@ -40,7 +40,8 @@ ExternalProject_Add(external_igc_llvm
     ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
     ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
     ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
 )
 add_dependencies(
   external_igc_llvm
@@ -55,9 +56,6 @@ ExternalProject_Add(external_igc_spirv_translator
   CONFIGURE_COMMAND echo .
   BUILD_COMMAND echo .
   INSTALL_COMMAND echo .
-  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
 )
 add_dependencies(
   external_igc_spirv_translator
@@ -15,7 +15,7 @@ llvm-config = '${LIBDIR}/llvm/bin/llvm-config'"
 )
 
 set(MESA_EXTRA_FLAGS
-  -Dbuildtype=release
+  ${MESON_BUILD_TYPE}
   -Dc_args=${MESA_CFLAGS}
   -Dcpp_args=${MESA_CXXFLAGS}
   -Dc_link_args=${MESA_LDFLAGS}
@@ -44,13 +44,21 @@ set(OPENVDB_EXTRA_ARGS
   # -DLLVM_DIR=${LIBDIR}/llvm/lib/cmake/llvm
 )
 
+set(OPENVDB_PATCH ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff)
+if(APPLE)
+  set(OPENVDB_PATCH
+    ${OPENVDB_PATCH} &&
+    ${PATCH_CMD} -p 0 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb_metal.diff
+  )
+endif()
+
 ExternalProject_Add(openvdb
   URL file://${PACKAGE_DIR}/${OPENVDB_FILE}
   DOWNLOAD_DIR ${DOWNLOAD_DIR}
   URL_HASH ${OPENVDB_HASH_TYPE}=${OPENVDB_HASH}
   CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR}
   PREFIX ${BUILD_DIR}/openvdb
-  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff
+  PATCH_COMMAND ${OPENVDB_PATCH}
   CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/openvdb ${DEFAULT_CMAKE_FLAGS} ${OPENVDB_EXTRA_ARGS}
   INSTALL_DIR ${LIBDIR}/openvdb
 )
@@ -16,8 +16,10 @@ message("BuildMode = ${BUILD_MODE}")
 
 if(BUILD_MODE STREQUAL "Debug")
   set(LIBDIR ${CMAKE_CURRENT_BINARY_DIR}/Debug)
+  set(MESON_BUILD_TYPE -Dbuildtype=debug)
 else()
   set(LIBDIR ${CMAKE_CURRENT_BINARY_DIR}/Release)
+  set(MESON_BUILD_TYPE -Dbuildtype=release)
 endif()
 
 set(DOWNLOAD_DIR "${CMAKE_CURRENT_BINARY_DIR}/downloads" CACHE STRING "Path for downloaded files")
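With `MESON_BUILD_TYPE` defined once per build mode, the Meson-based dependencies in this change set (epoxy, fribidi, harfbuzz, mesa, wayland, wayland-protocols) all receive a consistent `-Dbuildtype` flag. A rough sketch of what the epoxy configure step then expands to in a Release build (paths are illustrative placeholders, not the real `${LIBDIR}`/`${BUILD_DIR}` values):

```sh
# Illustrative expansion of the epoxy CONFIGURE_COMMAND in a Release build;
# the trailing -Dbuildtype=release comes from ${MESON_BUILD_TYPE}.
meson setup --prefix /deps/Release/epoxy \
  --default-library static --libdir lib \
  /deps/build/epoxy/src/external_epoxy-build \
  /deps/build/epoxy/src/external_epoxy \
  -Dtests=false -Dbuildtype=release
```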
@@ -88,6 +88,19 @@ else()
     export LDFLAGS=${PYTHON_LDFLAGS} &&
     export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)
 
+  # NOTE: untested on APPLE so far.
+  if(NOT APPLE)
+    set(PYTHON_CONFIGURE_EXTRA_ARGS
+      ${PYTHON_CONFIGURE_EXTRA_ARGS}
+      # Used on most release Linux builds (Fedora for e.g.),
+      # increases build times noticeably with the benefit of a modest speedup at runtime.
+      --enable-optimizations
+      # While LTO is OK when building on the same system, it's incompatible across GCC versions,
+      # making it impractical for developers to build against, so keep it disabled.
+      # `--with-lto`
+    )
+  endif()
+
 ExternalProject_Add(external_python
   URL file://${PACKAGE_DIR}/${PYTHON_FILE}
   DOWNLOAD_DIR ${DOWNLOAD_DIR}
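The new block only takes effect on non-Apple platforms: the CPython configure step gains profile-guided optimization while LTO stays deliberately off. In effect it amounts to something like the following (a sketch of the resulting configure invocation, not the literal `ExternalProject` command; the prefix path is illustrative):

```sh
# Sketch: effective extra flag for external_python on Linux.
# --enable-optimizations enables PGO; --with-lto is intentionally omitted
# because LTO objects are incompatible across GCC versions.
./configure --prefix="$LIBDIR/python" --enable-optimizations
```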
@@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
 # compiler, the versions used are taken from the following location
 # https://github.com/intel/intel-graphics-compiler/releases
 
-set(IGC_VERSION 1.0.12149.1)
+set(IGC_VERSION 1.0.13064.7)
 set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
-set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
+set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
 set(IGC_HASH_TYPE SHA256)
 set(IGC_FILE igc-${IGC_VERSION}.tar.gz)
 
@@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
 #
 # WARNING WARNING WARNING
 
-set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
+set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
 set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
-set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
+set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
 set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
 set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)
 
-set(IGC_VCINTRINSICS_VERSION v0.5.0)
+set(IGC_VCINTRINSICS_VERSION v0.11.0)
 set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
-set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
+set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
 set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
 set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)
 
@@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
 set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
 set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)
 
-set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
+set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
 set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
-set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
+set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
 set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
 set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
 
@@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
 ### Intel Graphics Compiler DEPS END ###
 ########################################
 
-set(GMMLIB_VERSION intel-gmmlib-22.1.8)
+set(GMMLIB_VERSION intel-gmmlib-22.3.0)
 set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
-set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
+set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
 set(GMMLIB_HASH_TYPE SHA256)
 set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)
 
-set(OCLOC_VERSION 22.38.24278)
+set(OCLOC_VERSION 22.49.25018.21)
 set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
-set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
+set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
 set(OCLOC_HASH_TYPE SHA256)
 set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)
 
@@ -13,7 +13,7 @@ ExternalProject_Add(external_wayland
   # NOTE: `-lm` is needed for `libxml2` which is a static library that uses `libm.so`,
   # without this, math symbols such as `floor` aren't found.
   CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env PKG_CONFIG_PATH=${LIBDIR}/expat/lib/pkgconfig:${LIBDIR}/xml2/lib/pkgconfig:${LIBDIR}/ffi/lib/pkgconfig:$PKG_CONFIG_PATH
-    ${MESON} --prefix ${LIBDIR}/wayland -Ddocumentation=false -Dtests=false -D "c_link_args=-L${LIBDIR}/ffi/lib -lm" . ../external_wayland
+    ${MESON} --prefix ${LIBDIR}/wayland ${MESON_BUILD_TYPE} -Ddocumentation=false -Dtests=false -D "c_link_args=-L${LIBDIR}/ffi/lib -lm" . ../external_wayland
   BUILD_COMMAND ninja
   INSTALL_COMMAND ninja install
 )
@@ -7,7 +7,7 @@ ExternalProject_Add(external_wayland_protocols
   PREFIX ${BUILD_DIR}/wayland-protocols
   # Use `-E` so the `PKG_CONFIG_PATH` can be defined to link against our own WAYLAND.
   CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env PKG_CONFIG_PATH=${LIBDIR}/wayland/lib64/pkgconfig:$PKG_CONFIG_PATH
-    ${MESON} --prefix ${LIBDIR}/wayland-protocols . ../external_wayland_protocols -Dtests=false
+    ${MESON} --prefix ${LIBDIR}/wayland-protocols ${MESON_BUILD_TYPE} . ../external_wayland_protocols -Dtests=false
   BUILD_COMMAND ninja
   INSTALL_COMMAND ninja install
 )
@@ -17,11 +17,13 @@ ExternalProject_Add(external_xvidcore
   INSTALL_DIR ${LIBDIR}/xvidcore
 )
 
-ExternalProject_Add_Step(external_xvidcore after_install
-  COMMAND ${CMAKE_COMMAND} -E rename ${LIBDIR}/xvidcore/lib/xvidcore.a ${LIBDIR}/xvidcore/lib/libxvidcore.a || true
-  COMMAND ${CMAKE_COMMAND} -E remove ${LIBDIR}/xvidcore/lib/xvidcore.dll.a
-  DEPENDEES install
-)
+if(WIN32)
+  ExternalProject_Add_Step(external_xvidcore after_install
+    COMMAND ${CMAKE_COMMAND} -E rename ${LIBDIR}/xvidcore/lib/xvidcore.a ${LIBDIR}/xvidcore/lib/libxvidcore.a || true
+    COMMAND ${CMAKE_COMMAND} -E remove ${LIBDIR}/xvidcore/lib/xvidcore.dll.a
+    DEPENDEES install
+  )
+endif()
 
 if(MSVC)
   set_target_properties(external_xvidcore PROPERTIES FOLDER Mingw)
build_files/build_environment/linux/make_deps_wrapper.sh (new executable file, 74 lines)

@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+# This script ensures:
+# - One dependency is built at a time.
+# - That dependency uses all available cores.
+#
+# Without this, simply calling `make -j$(nproc)` from the `${CMAKE_BUILD_DIR}/deps/`
+# directory will build many projects at once.
+#
+# This is undesirable for the following reasons:
+#
+# - The output from projects is mixed together,
+#   making it difficult to track down the cause of a build failure.
+#
+# - Larger dependencies such as LLVM can bottleneck the build process,
+#   making it necessary to cancel the build and manually run build commands in each directory.
+#
+# - Building many projects at once means canceling (Control-C) can lead to the build being in an undefined state.
+#   It's possible canceling happens as a patch is being applied or files are being copied.
+#   (steps that aren't part of the compilation process where it's typically safe to cancel).
+
+if [[ -z "$MY_MAKE_CALL_LEVEL" ]]; then
+  export MY_MAKE_CALL_LEVEL=0
+  export MY_MAKEFLAGS=$MAKEFLAGS
+
+  # Extract the jobs argument (`-jN`, `-j N`, `--jobs=N`).
+  add_next=0
+  for i in "$@"; do
+    case $i in
+      -j*)
+        export MY_JOBS_ARG=$i
+        if [ "$MY_JOBS_ARG" = "-j" ]; then
+          add_next=1
+        fi
+        ;;
+      --jobs=*)
+        shift # past argument=value
+        MY_JOBS_ARG=$i
+        ;;
+      *)
+        if (( add_next == 1 )); then
+          MY_JOBS_ARG="$MY_JOBS_ARG $i"
+          add_next=0
+        fi
+        ;;
+    esac
+  done
+  unset i add_next
+
+  if [[ -z "$MY_JOBS_ARG" ]]; then
+    MY_JOBS_ARG="-j$(nproc)"
+  fi
+  export MY_JOBS_ARG
+  # Support user defined `MAKEFLAGS`.
+  export MAKEFLAGS="$MY_MAKEFLAGS -j1"
+else
+  export MY_MAKE_CALL_LEVEL=$(( MY_MAKE_CALL_LEVEL + 1 ))
+  if (( MY_MAKE_CALL_LEVEL == 1 )); then
+    # Important to set jobs to 1, otherwise user defined jobs argument is used.
+    export MAKEFLAGS="$MY_MAKEFLAGS -j1"
+  elif (( MY_MAKE_CALL_LEVEL == 2 )); then
+    # This is the level used by each sub-project.
+    export MAKEFLAGS="$MY_MAKEFLAGS $MY_JOBS_ARG"
+  fi
+  # Else leave `MY_MAKEFLAGS` flags as-is, avoids setting a high number of jobs on recursive
+  # calls (which may easily run out of memory). Let the job-server handle the rest.
+fi
+
+# Useful for troubleshooting the wrapper.
+# echo "Call level: $MY_MAKE_CALL_LEVEL, args=$@".
+
+# Call actual make but ensure recursive calls run via this script.
+exec make MAKE="$0" "$@"
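The wrapper leans on two `make` behaviors: `MAKEFLAGS` is exported to every sub-make, and `MAKE="$0"` forces recursive invocations back through this script, so the job count can differ per recursion depth (serial across the project list, parallel inside each project). A hypothetical direct invocation (the build-tree path is illustrative):

```sh
# Build dependencies one project at a time, each project using all cores;
# the recursion level decides which MAKEFLAGS value applies.
cd ~/blender-git/build_linux/deps
~/blender-git/blender/build_files/build_environment/linux/make_deps_wrapper.sh -s -j"$(nproc)"
```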
@@ -1,7 +1,7 @@
 diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
 --- external_igc_opencl_clang.orig/CMakeLists.txt  2022-03-16 05:51:10 -0600
 +++ external_igc_opencl_clang/CMakeLists.txt  2022-05-23 10:40:09 -0600
-@@ -126,22 +126,24 @@
+@@ -147,22 +147,24 @@
 )
 endif()
 
build_files/build_environment/patches/openvdb_metal.diff (new file, 8007 lines)

(File diff suppressed because it is too large.)
@@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)
 
   if(MY_WC_BRANCH STREQUAL "HEAD")
     # Detached HEAD, check whether commit hash is reachable
-    # in the master branch
+    # in the main branch
     execute_process(COMMAND git rev-parse --short=12 HEAD
                     WORKING_DIRECTORY ${SOURCE_DIR}
                     OUTPUT_VARIABLE MY_WC_HASH
                     OUTPUT_STRIP_TRAILING_WHITESPACE)
 
-    execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
+    execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
                     WORKING_DIRECTORY ${SOURCE_DIR}
                     OUTPUT_VARIABLE _git_contains_check
                     OUTPUT_STRIP_TRAILING_WHITESPACE)
 
     if(NOT _git_contains_check STREQUAL "")
-      set(MY_WC_BRANCH "master")
+      set(MY_WC_BRANCH "main")
     else()
       execute_process(COMMAND git show-ref --tags -d
                       WORKING_DIRECTORY ${SOURCE_DIR}
@@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
                       OUTPUT_STRIP_TRAILING_WHITESPACE)
 
       if(_git_tag_hashes MATCHES "${_git_head_hash}")
-        set(MY_WC_BRANCH "master")
+        set(MY_WC_BRANCH "main")
       else()
         execute_process(COMMAND git branch --contains ${MY_WC_HASH}
                         WORKING_DIRECTORY ${SOURCE_DIR}
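The renamed branch test can be reproduced from a shell; a sketch of what the two `execute_process` calls above amount to on a detached HEAD after the master-to-main rename:

```sh
# Non-empty output from the second command means the commit is reachable
# from main or a release branch, so MY_WC_BRANCH becomes "main".
HASH=$(git rev-parse --short=12 HEAD)
git branch --list main 'blender-v*' --contains "$HASH"
```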
@@ -85,7 +85,7 @@ if(NOT APPLE)
   set(WITH_CYCLES_DEVICE_OPTIX ON CACHE BOOL "" FORCE)
   set(WITH_CYCLES_CUDA_BINARIES ON CACHE BOOL "" FORCE)
   set(WITH_CYCLES_CUBIN_COMPILER OFF CACHE BOOL "" FORCE)
-  set(WITH_CYCLES_HIP_BINARIES ON CACHE BOOL "" FORCE)
+  set(WITH_CYCLES_HIP_BINARIES OFF CACHE BOOL "" FORCE)
   set(WITH_CYCLES_DEVICE_ONEAPI ON CACHE BOOL "" FORCE)
   set(WITH_CYCLES_ONEAPI_BINARIES ON CACHE BOOL "" FORCE)
 endif()
@@ -11,11 +11,11 @@
 mkdir ~/blender-git
 cd ~/blender-git
 
-git clone http://git.blender.org/blender.git
+git clone https://projects.blender.org/blender/blender.git
 cd blender
 git submodule update --init --recursive
-git submodule foreach git checkout master
-git submodule foreach git pull --rebase origin master
+git submodule foreach git checkout main
+git submodule foreach git pull --rebase origin main
 
 # create build dir
 mkdir ~/blender-git/build-cmake
@@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
 echo ""
 echo "* Useful Commands *"
 echo "  Run Blender: ~/blender-git/blender/blender.bin"
-echo "  Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
+echo "  Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
 echo "  Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
 echo "  Build Blender: cd ~/blender-git/build-cmake ; make"
 echo ""
@@ -544,7 +544,7 @@ endfunction()
 function(setup_platform_linker_libs
   target
 )
-  # jemalloc must be early in the list, to be before pthread (see T57998)
+  # jemalloc must be early in the list, to be before pthread (see #57998).
   if(WITH_MEM_JEMALLOC)
     target_link_libraries(${target} ${JEMALLOC_LIBRARIES})
   endif()
@@ -440,7 +440,7 @@ string(APPEND PLATFORM_LINKFLAGS " -stdlib=libc++")
 # Make stack size more similar to Embree, required for Embree.
 string(APPEND PLATFORM_LINKFLAGS_EXECUTABLE " -Wl,-stack_size,0x100000")
 
-# Suppress ranlib "has no symbols" warnings (workaround for T48250)
+# Suppress ranlib "has no symbols" warnings (workaround for #48250).
 set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
 set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
 # llvm-ranlib doesn't support this flag. Xcode's libtool does.
@@ -121,7 +121,7 @@ if(WITH_WINDOWS_BUNDLE_CRT)
   include(InstallRequiredSystemLibraries)
 
   # ucrtbase(d).dll cannot be in the manifest, due to the way windows 10 handles
-  # redirects for this dll, for details see T88813.
+  # redirects for this dll, for details see #88813.
   foreach(lib ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS})
     string(FIND ${lib} "ucrtbase" pos)
     if(NOT pos EQUAL -1)
@@ -295,7 +295,7 @@ unset(MATERIALX_LIB_FOLDER_EXISTS)
 if(NOT MSVC_CLANG AND # Available with MSVC 15.7+ but not for CLANG.
    NOT WITH_WINDOWS_SCCACHE AND # And not when sccache is enabled
    NOT VS_CLANG_TIDY) # Clang-tidy does not like these options
-  add_compile_options(/experimental:external /external:templates- /external:I "${LIBDIR}" /external:W0)
+  add_compile_options(/experimental:external /external:I "${LIBDIR}" /external:W0)
 endif()
 
 # Add each of our libraries to our cmake_prefix_path so find_package() could work
@@ -901,11 +901,11 @@ endif()

 if(WINDOWS_PYTHON_DEBUG)
   # Include the system scripts in the blender_python_system_scripts project.
-  file(GLOB_RECURSE inFiles "${CMAKE_SOURCE_DIR}/release/scripts/*.*" )
+  file(GLOB_RECURSE inFiles "${CMAKE_SOURCE_DIR}/scripts/*.*" )
   add_custom_target(blender_python_system_scripts SOURCES ${inFiles})
   foreach(_source IN ITEMS ${inFiles})
     get_filename_component(_source_path "${_source}" PATH)
-    string(REPLACE "${CMAKE_SOURCE_DIR}/release/scripts/" "" _source_path "${_source_path}")
+    string(REPLACE "${CMAKE_SOURCE_DIR}/scripts/" "" _source_path "${_source_path}")
     string(REPLACE "/" "\\" _group_path "${_source_path}")
     source_group("${_group_path}" FILES "${_source}")
   endforeach()
@@ -940,7 +940,7 @@ if(WINDOWS_PYTHON_DEBUG)
   file(WRITE ${USER_PROPS_FILE} "<?xml version=\"1.0\" encoding=\"utf-8\"?>
 <Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">
   <PropertyGroup>
-    <LocalDebuggerCommandArguments>-con --env-system-scripts \"${CMAKE_SOURCE_DIR}/release/scripts\" </LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>-con --env-system-scripts \"${CMAKE_SOURCE_DIR}/scripts\" </LocalDebuggerCommandArguments>
   </PropertyGroup>
 </Project>")
 endif()
@@ -142,7 +142,7 @@ def cmake_advanced_info() -> Union[Tuple[List[str], List[Tuple[str, str]]], Tupl

     make_exe = cmake_cache_var("CMAKE_MAKE_PROGRAM")
     if make_exe is None:
-        print("Make command not found in: %r not found" % project_path)
+        print("Make command not found: CMAKE_MAKE_PROGRAM")
         return None, None

     make_exe_basename = os.path.basename(make_exe)
@@ -1,53 +1,3 @@
-#
-# Used by Buildbot build pipeline make_update.py script only for now
-# We intended to update the make_update.py in the branches to use this file eventually
-#
-update-code:
-  git:
-    submodules:
-      - branch: master
-        commit_id: HEAD
-        path: release/scripts/addons
-      - branch: master
-        commit_id: HEAD
-        path: release/scripts/addons_contrib
-      - branch: master
-        commit_id: HEAD
-        path: release/datafiles/locale
-      - branch: master
-        commit_id: HEAD
-        path: source/tools
-  svn:
-    libraries:
-      darwin-arm64:
-        branch: trunk
-        commit_id: HEAD
-        path: lib/darwin_arm64
-      darwin-x86_64:
-        branch: trunk
-        commit_id: HEAD
-        path: lib/darwin
-      linux-x86_64:
-        branch: trunk
-        commit_id: HEAD
-        path: lib/linux_x86_64_glibc_228
-      windows-amd64:
-        branch: trunk
-        commit_id: HEAD
-        path: lib/win64_vc15
-    tests:
-      branch: trunk
-      commit_id: HEAD
-      path: lib/tests
-    benchmarks:
-      branch: trunk
-      commit_id: HEAD
-      path: lib/benchmarks
-    assets:
-      branch: trunk
-      commit_id: HEAD
-      path: lib/assets
-
 #
 # Buildbot only configs
 #
@@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
 ## Source Code

 * [Releases](https://download.blender.org/source/)
-* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
+* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)

 ## Credits
@@ -135,7 +135,7 @@ def submodules_to_manifest(
         submodule = line.split()[1]

         # Don't use native slashes as GIT for MS-Windows outputs forward slashes.
-        if skip_addon_contrib and submodule == "release/scripts/addons_contrib":
+        if skip_addon_contrib and submodule == "scripts/addons_contrib":
             continue

         for path in git_ls_files(blender_srcdir / submodule):
@@ -16,14 +16,28 @@ import shutil
 import sys

 import make_utils
+from pathlib import Path
 from make_utils import call, check_output
+from urllib.parse import urljoin

 from typing import (
     List,
+    Iterable,
     Optional,
 )


+class Submodule:
+    path: str
+    branch: str
+    branch_fallback: str
+
+    def __init__(self, path: str, branch: str, branch_fallback: str) -> None:
+        self.path = path
+        self.branch = branch
+        self.branch_fallback = branch_fallback
+
+
 def print_stage(text: str) -> None:
     print("")
     print(text)
@@ -42,6 +56,7 @@ def parse_arguments() -> argparse.Namespace:
     parser.add_argument("--svn-branch", default=None)
     parser.add_argument("--git-command", default="git")
     parser.add_argument("--use-linux-libraries", action="store_true")
+    parser.add_argument("--architecture", type=str, choices=("x86_64", "amd64", "arm64",))
     return parser.parse_args()

@@ -51,6 +66,19 @@ def get_blender_git_root() -> str:
 # Setup for precompiled libraries and tests from svn.


+def get_effective_architecture(args: argparse.Namespace) -> str:
+    architecture = args.architecture
+    if architecture:
+        assert isinstance(architecture, str)
+        return architecture
+
+    # Check platform.version to detect arm64 with x86_64 python binary.
+    if "ARM64" in platform.version():
+        return "arm64"
+
+    return platform.machine().lower()
+
+
 def svn_update(args: argparse.Namespace, release_version: Optional[str]) -> None:
     svn_non_interactive = [args.svn_command, '--non-interactive']

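Note: the new `get_effective_architecture()` above prefers the explicit `--architecture` flag and only then probes the interpreter. A standalone sketch of the same fallback (stdlib only; the `Namespace` literal is just for illustration):

```python
import argparse
import platform

def effective_architecture(args: argparse.Namespace) -> str:
    # An explicit --architecture flag always wins.
    if args.architecture:
        return args.architecture
    # Per the comment in the diff: an x86_64 Python binary on an ARM64 host
    # can still expose "ARM64" via the OS version string.
    if "ARM64" in platform.version():
        return "arm64"
    return platform.machine().lower()

print(effective_architecture(argparse.Namespace(architecture=None)))  # e.g. "x86_64"
```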
@@ -58,11 +86,11 @@ def svn_update(args: argparse.Namespace, release_version: Optional[str]) -> None
     svn_url = make_utils.svn_libraries_base_url(release_version, args.svn_branch)

     # Checkout precompiled libraries
+    architecture = get_effective_architecture(args)
     if sys.platform == 'darwin':
-        # Check platform.version to detect arm64 with x86_64 python binary.
-        if platform.machine() == 'arm64' or ('ARM64' in platform.version()):
+        if architecture == 'arm64':
             lib_platform = "darwin_arm64"
-        elif platform.machine() == 'x86_64':
+        elif architecture == 'x86_64':
             lib_platform = "darwin"
         else:
             lib_platform = None
@@ -170,7 +198,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
         return "rebase or merge in progress, complete it first"

     # Abort if uncommitted changes.
-    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
+    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
     if len(changes) != 0:
         return "you have unstaged changes"

@@ -184,97 +212,282 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
     return ""


+def use_upstream_workflow(args: argparse.Namespace) -> bool:
+    return make_utils.git_remote_exist(args.git_command, "upstream")
+
+
+def work_tree_update_upstream_workflow(args: argparse.Namespace, use_fetch=True) -> str:
+    """
+    Update the Blender repository using the Github style of fork organization
+
+    Returns true if the current local branch has been updated to the upstream state.
+    Otherwise false is returned.
+    """
+
+    branch_name = make_utils.git_branch(args.git_command)
+
+    if use_fetch:
+        call((args.git_command, "fetch", "upstream"))
+
+    upstream_branch = f"upstream/{branch_name}"
+    if not make_utils.git_branch_exists(args.git_command, upstream_branch):
+        return "no_branch"
+
+    retcode = call((args.git_command, "merge", "--ff-only", upstream_branch), exit_on_error=False)
+    if retcode != 0:
+        return "Unable to fast forward\n"
+
+    return ""
+
+
+def work_tree_update(args: argparse.Namespace, use_fetch=True) -> str:
+    """
+    Update the Git working tree using the best strategy
+
+    This function detects whether it is a github style of fork remote organization is used, or
+    is it a repository which origin is an upstream.
+    """
+
+    if use_upstream_workflow(args):
+        message = work_tree_update_upstream_workflow(args, use_fetch)
+        if message != "no_branch":
+            return message
+
+        # If there is upstream configured but the local branch is not in the upstream, try to
+        # update the branch from the fork.
+
+    update_command = [args.git_command, "pull", "--rebase"]
+
+    call(update_command)
+
+    return ""
+
+
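Note: the two strategies that `work_tree_update()` chooses between reduce to a handful of git invocations. A rough, illustrative sketch using plain `subprocess` instead of the script's `call()` helper:

```python
import subprocess

def update_work_tree(git: str, branch: str, has_upstream_remote: bool) -> None:
    if has_upstream_remote:
        # Fork ("upstream") workflow: fast-forward onto the upstream branch.
        subprocess.check_call([git, "fetch", "upstream"])
        subprocess.check_call([git, "merge", "--ff-only", f"upstream/{branch}"])
    else:
        # Plain clone where origin is the upstream: a rebase pull suffices.
        subprocess.check_call([git, "pull", "--rebase"])
```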
 # Update blender repository.
-def blender_update(args: argparse.Namespace) -> None:
+def blender_update(args: argparse.Namespace) -> str:
     print_stage("Updating Blender Git Repository")
-    call([args.git_command, "pull", "--rebase"])
+    return work_tree_update(args)


-# Update submodules.
-def submodules_update(
-    args: argparse.Namespace,
-    release_version: Optional[str],
-    branch: Optional[str],
-) -> str:
-    print_stage("Updating Submodules")
-    if make_utils.command_missing(args.git_command):
-        sys.stderr.write("git not found, can't update code\n")
-        sys.exit(1)
+def resolve_external_url(blender_url: str, repo_name: str) -> str:
+    return urljoin(blender_url + "/", "../" + repo_name)


-    # Update submodules to appropriate given branch,
-    # falling back to master if none is given and/or found in a sub-repository.
-    branch_fallback = "master"
+def external_script_copy_old_submodule_over(args: argparse.Namespace, directory_name: str) -> None:
+    blender_git_root = Path(get_blender_git_root())
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    old_submodule_relative_dir = Path("release") / "scripts" / directory_name
+    print(f"Moving {old_submodule_relative_dir} to scripts/{directory_name} ...")
+
+    old_submodule_dir = blender_git_root / old_submodule_relative_dir
+    shutil.move(old_submodule_dir, external_dir)
+
+    # Remove old ".git" which is a file with path to a submodule bare repo inside of main
+    # repo .git/modules directory.
+    (external_dir / ".git").unlink()
+
+    bare_repo_relative_dir = Path(".git") / "modules" / "release" / "scripts" / directory_name
+    print(f"Copying {bare_repo_relative_dir} to scripts/{directory_name}/.git ...")
+    bare_repo_dir = blender_git_root / bare_repo_relative_dir
+    shutil.copytree(bare_repo_dir, external_dir / ".git")
+
+    git_config = external_dir / ".git" / "config"
+    call((args.git_command, "config", "--file", git_config, "--unset", "core.worktree"))
+
+
+def external_script_initialize_if_needed(args: argparse.Namespace,
+                                         repo_name: str,
+                                         directory_name: str) -> None:
+    """Initialize checkout of an external repository scripts directory"""
+
+    blender_git_root = Path(get_blender_git_root())
+    blender_dot_git = blender_git_root / ".git"
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    if external_dir.exists():
+        return
+
+    print(f"Initializing scripts/{directory_name} ...")
+
+    old_submodule_dot_git = blender_git_root / "release" / "scripts" / directory_name / ".git"
+    if old_submodule_dot_git.exists() and blender_dot_git.is_dir():
+        external_script_copy_old_submodule_over(args, directory_name)
+        return
+
+    origin_name = "upstream" if use_upstream_workflow(args) else "origin"
+    blender_url = make_utils.git_get_remote_url(args.git_command, origin_name)
+    external_url = resolve_external_url(blender_url, repo_name)
+
+    call((args.git_command, "clone", "--origin", origin_name, external_url, external_dir))
+
+
+def external_script_add_origin_if_needed(args: argparse.Namespace,
+                                         repo_name: str,
+                                         directory_name: str) -> str:
+    """
+    Add remote called 'origin' if there is a fork of the external repository available
+
+    This is only done when using Github style upstream workflow in the main repository.
+    """
+
+    if not use_upstream_workflow(args):
+        return ""
+
+    cwd = os.getcwd()
+
+    blender_git_root = Path(get_blender_git_root())
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    origin_blender_url = make_utils.git_get_remote_url(args.git_command, "origin")
+    origin_external_url = resolve_external_url(origin_blender_url, repo_name)
+
+    try:
+        os.chdir(external_dir)
+
+        if (make_utils.git_remote_exist(args.git_command, "origin") or
+                not make_utils.git_remote_exist(args.git_command, "upstream")):
+            return
+
+        if not make_utils.git_is_remote_repository(args.git_command, origin_external_url):
+            return
+
+        print(f"Adding origin remote to {directory_name} pointing to fork ...")
+
+        # Non-obvious tricks to introduce the new remote called "origin" to the existing
+        # submodule configuration.
+        #
+        # This is all within the content of creating a fork of a submodule after `make update`
+        # has been run and possibly local branches tracking upstream were added.
+        #
+        # The idea here goes as following:
+        #
+        #  - Rename remote "upstream" to "origin", which takes care of changing the names of
+        #    remotes the local branches are tracking.
+        #
+        #  - Change the URL to the "origin", which so far was still pointing to upstream.
+        #
+        #  - Re-introduce the "upstream" remote, with the same URL as it had prior to rename.
+
+        upstream_url = make_utils.git_get_remote_url(args.git_command, "upstream")
+
+        call((args.git_command, "remote", "rename", "upstream", "origin"))
+        make_utils.git_set_config(args.git_command, f"remote.origin.url", origin_external_url)
+
+        call((args.git_command, "remote", "add", "upstream", upstream_url))
+    finally:
+        os.chdir(cwd)
+
+    return ""
+
+
def external_scripts_update(args: argparse.Namespace,
|
||||||
|
repo_name: str,
|
||||||
|
directory_name: str,
|
||||||
|
branch: Optional[str]) -> str:
|
||||||
|
"""Update a single external checkout with the given name in the scripts folder"""
|
||||||
|
|
||||||
|
external_script_initialize_if_needed(args, repo_name, directory_name)
|
||||||
|
external_script_add_origin_if_needed(args, repo_name, directory_name)
|
||||||
|
|
||||||
|
print(f"Updating scripts/{directory_name} ...")
|
||||||
|
|
||||||
|
cwd = os.getcwd()
|
||||||
|
|
||||||
|
blender_git_root = Path(get_blender_git_root())
|
||||||
|
scripts_dir = blender_git_root / "scripts"
|
||||||
|
external_dir = scripts_dir / directory_name
|
||||||
|
|
||||||
|
# Update externals to appropriate given branch, falling back to main if none is given and/or
|
||||||
|
# found in a sub-repository.
|
||||||
|
branch_fallback = "main"
|
||||||
if not branch:
|
if not branch:
|
||||||
branch = branch_fallback
|
branch = branch_fallback
|
||||||
|
|
||||||
submodules = [
|
|
||||||
("release/scripts/addons", branch, branch_fallback),
|
|
||||||
("release/scripts/addons_contrib", branch, branch_fallback),
|
|
||||||
("release/datafiles/locale", branch, branch_fallback),
|
|
||||||
("source/tools", branch, branch_fallback),
|
|
||||||
]
|
|
||||||
|
|
||||||
# Initialize submodules only if needed.
|
|
||||||
for submodule_path, submodule_branch, submodule_branch_fallback in submodules:
|
|
||||||
if not os.path.exists(os.path.join(submodule_path, ".git")):
|
|
||||||
call([args.git_command, "submodule", "update", "--init", "--recursive"])
|
|
||||||
break
|
|
||||||
|
|
||||||
# Checkout appropriate branch and pull changes.
|
|
||||||
skip_msg = ""
|
skip_msg = ""
|
||||||
for submodule_path, submodule_branch, submodule_branch_fallback in submodules:
|
|
||||||
cwd = os.getcwd()
|
|
||||||
try:
|
|
||||||
os.chdir(submodule_path)
|
|
||||||
msg = git_update_skip(args, check_remote_exists=False)
|
|
||||||
if msg:
|
|
||||||
skip_msg += submodule_path + " skipped: " + msg + "\n"
|
|
||||||
else:
|
|
||||||
# Find a matching branch that exists.
|
|
||||||
call([args.git_command, "fetch", "origin"])
|
|
||||||
if make_utils.git_branch_exists(args.git_command, submodule_branch):
|
|
||||||
pass
|
|
||||||
elif make_utils.git_branch_exists(args.git_command, submodule_branch_fallback):
|
|
||||||
submodule_branch = submodule_branch_fallback
|
|
||||||
else:
|
|
||||||
# Skip.
|
|
||||||
submodule_branch = ""
|
|
||||||
|
|
||||||
# Switch to branch and pull.
|
try:
|
||||||
if submodule_branch:
|
os.chdir(external_dir)
|
||||||
if make_utils.git_branch(args.git_command) != submodule_branch:
|
msg = git_update_skip(args, check_remote_exists=False)
|
||||||
call([args.git_command, "checkout", submodule_branch])
|
if msg:
|
||||||
call([args.git_command, "pull", "--rebase", "origin", submodule_branch])
|
skip_msg += directory_name + " skipped: " + msg + "\n"
|
||||||
finally:
|
else:
|
||||||
os.chdir(cwd)
|
# Find a matching branch that exists.
|
||||||
|
for remote in ("origin", "upstream"):
|
||||||
|
if make_utils.git_remote_exist(args.git_command, remote):
|
||||||
|
call([args.git_command, "fetch", remote])
|
||||||
|
|
||||||
|
submodule_branch = branch
|
||||||
|
|
||||||
|
if make_utils.git_branch_exists(args.git_command, submodule_branch):
|
||||||
|
pass
|
||||||
|
elif make_utils.git_branch_exists(args.git_command, branch_fallback):
|
||||||
|
submodule_branch = branch_fallback
|
||||||
|
else:
|
||||||
|
# Skip.
|
||||||
|
submodule_branch = ""
|
||||||
|
|
||||||
|
# Switch to branch and pull.
|
||||||
|
if submodule_branch:
|
||||||
|
if make_utils.git_branch(args.git_command) != submodule_branch:
|
||||||
|
call([args.git_command, "checkout", submodule_branch])
|
||||||
|
# Don't use extra fetch since all remotes of interest have been already fetched
|
||||||
|
# some lines above.
|
||||||
|
skip_msg += work_tree_update(args, use_fetch=False)
|
||||||
|
finally:
|
||||||
|
os.chdir(cwd)
|
||||||
|
|
||||||
return skip_msg
|
return skip_msg
|
||||||
|
|
||||||
|
|
||||||
|
def scripts_submodules_update(args: argparse.Namespace, branch: Optional[str]) -> str:
|
||||||
|
"""Update working trees of addons and addons_contrib within the scripts/ directory"""
|
||||||
|
msg = ""
|
||||||
|
|
||||||
|
msg += external_scripts_update(args, "blender-addons", "addons", branch)
|
||||||
|
msg += external_scripts_update(args, "blender-addons-contrib", "addons_contrib", branch)
|
||||||
|
|
||||||
|
return msg
|
||||||
|
|
||||||
|
|
||||||
|
def submodules_update(args: argparse.Namespace, branch: Optional[str]) -> str:
|
||||||
|
"""Update submodules or other externally tracked source trees"""
|
||||||
|
msg = ""
|
||||||
|
|
||||||
|
msg += scripts_submodules_update(args, branch)
|
||||||
|
|
||||||
|
return msg
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_arguments()
|
args = parse_arguments()
|
||||||
blender_skip_msg = ""
|
blender_skip_msg = ""
|
||||||
submodules_skip_msg = ""
|
submodules_skip_msg = ""
|
||||||
|
|
||||||
# Test if we are building a specific release version.
|
blender_version = make_utils. parse_blender_version()
|
||||||
branch = make_utils.git_branch(args.git_command)
|
if blender_version.cycle != 'alpha':
|
||||||
if branch == 'HEAD':
|
major = blender_version.version // 100
|
||||||
sys.stderr.write('Blender git repository is in detached HEAD state, must be in a branch\n')
|
minor = blender_version.version % 100
|
||||||
sys.exit(1)
|
branch = f"blender-v{major}.{minor}-release"
|
||||||
|
release_version: Optional[str] = f"{major}.{minor}"
|
||||||
tag = make_utils.git_tag(args.git_command)
|
else:
|
||||||
release_version = make_utils.git_branch_release_version(branch, tag)
|
branch = 'main'
|
||||||
|
release_version = None
|
||||||
|
|
||||||
if not args.no_libraries:
|
if not args.no_libraries:
|
||||||
svn_update(args, release_version)
|
svn_update(args, release_version)
|
||||||
if not args.no_blender:
|
if not args.no_blender:
|
||||||
blender_skip_msg = git_update_skip(args)
|
blender_skip_msg = git_update_skip(args)
|
||||||
|
if not blender_skip_msg:
|
||||||
|
blender_skip_msg = blender_update(args)
|
||||||
if blender_skip_msg:
|
if blender_skip_msg:
|
||||||
blender_skip_msg = "Blender repository skipped: " + blender_skip_msg + "\n"
|
blender_skip_msg = "Blender repository skipped: " + blender_skip_msg + "\n"
|
||||||
else:
|
|
||||||
blender_update(args)
|
|
||||||
if not args.no_submodules:
|
if not args.no_submodules:
|
||||||
submodules_skip_msg = submodules_update(args, release_version, branch)
|
submodules_skip_msg = submodules_update(args, branch)
|
||||||
|
|
||||||
# Report any skipped repositories at the end, so it's not as easy to miss.
|
# Report any skipped repositories at the end, so it's not as easy to miss.
|
||||||
skip_msg = blender_skip_msg + submodules_skip_msg
|
skip_msg = blender_skip_msg + submodules_skip_msg
|
||||||
|
@@ -9,7 +9,9 @@ import re
 import shutil
 import subprocess
 import sys
+import os
 from pathlib import Path
+from urllib.parse import urljoin

 from typing import (
     Sequence,
@@ -19,7 +21,7 @@ from typing import (

 def call(cmd: Sequence[str], exit_on_error: bool = True, silent: bool = False) -> int:
     if not silent:
-        print(" ".join(cmd))
+        print(" ".join([str(x) for x in cmd]))

     # Flush to ensure correct order output on Windows.
     sys.stdout.flush()
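Note: this one-line change matters because the new update code passes `pathlib.Path` objects (such as `external_dir`) inside command lists, and `str.join()` rejects non-string items. A minimal illustration (the URL and path are made up):

```python
from pathlib import Path

cmd = ["git", "clone", "https://example.org/repo.git", Path("/tmp/checkout")]
# " ".join(cmd) would raise TypeError: expected str instance, PosixPath found.
print(" ".join([str(x) for x in cmd]))
```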
@@ -55,10 +57,48 @@ def check_output(cmd: Sequence[str], exit_on_error: bool = True) -> str:
 def git_branch_exists(git_command: str, branch: str) -> bool:
     return (
         call([git_command, "rev-parse", "--verify", branch], exit_on_error=False, silent=True) == 0 or
+        call([git_command, "rev-parse", "--verify", "remotes/upstream/" + branch], exit_on_error=False, silent=True) == 0 or
         call([git_command, "rev-parse", "--verify", "remotes/origin/" + branch], exit_on_error=False, silent=True) == 0
     )


+def git_get_remote_url(git_command: str, remote_name: str) -> bool:
+    return check_output((git_command, "ls-remote", "--get-url", remote_name))
+
+
+def git_remote_exist(git_command: str, remote_name: str) -> bool:
+    """Check whether there is a remote with the given name"""
+    # `git ls-remote --get-url upstream` will print an URL if there is such remote configured, and
+    # otherwise will print "upstream".
+    remote_url = check_output((git_command, "ls-remote", "--get-url", remote_name))
+    return remote_url != remote_name
+
+
+def git_get_resolved_submodule_url(git_command: str, blender_url: str, submodule_path: str) -> str:
+    git_root = check_output([git_command, "rev-parse", "--show-toplevel"])
+    dot_gitmodules = os.path.join(git_root, ".gitmodules")
+
+    submodule_key_prefix = f"submodule.{submodule_path}"
+    submodule_key_url = f"{submodule_key_prefix}.url"
+
+    gitmodule_url = git_get_config(
+        git_command, submodule_key_url, file=dot_gitmodules)
+
+    # A bit of a trickery to construct final URL.
+    # Only works for the relative submodule URLs.
+    #
+    # Note that unless the LHS URL ends up with a slash urljoin treats the last component as a
+    # file.
+    assert gitmodule_url.startswith('..')
+    return urljoin(blender_url + "/", gitmodule_url)
+
+
+def git_is_remote_repository(git_command: str, repo: str) -> bool:
+    """Returns true if the given repository is a valid/clonable git repo"""
+    exit_code = call((git_command, "ls-remote", repo, "HEAD"), exit_on_error=False, silent=True)
+    return exit_code == 0
+
+
 def git_branch(git_command: str) -> str:
     # Get current branch name.
     try:
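Note: the `urljoin` caveat quoted in `git_get_resolved_submodule_url()` is the crux of both URL helpers: without a trailing slash the last path component is treated as a file and dropped during relative resolution. For example:

```python
from urllib.parse import urljoin

blender_url = "https://projects.blender.org/blender/blender.git"

# Without the trailing slash "blender.git" is treated as a file and replaced:
print(urljoin(blender_url, "../blender-addons.git"))
# -> https://projects.blender.org/blender-addons.git

# With it, ".." resolves relative to the repository path, as intended:
print(urljoin(blender_url + "/", "../blender-addons.git"))
# -> https://projects.blender.org/blender/blender-addons.git
```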
@@ -70,6 +110,20 @@ def git_branch(git_command: str) -> str:
     return branch.strip().decode('utf8')


+def git_get_config(git_command: str, key: str, file: Optional[str] = None) -> str:
+    if file:
+        return check_output([git_command, "config", "--file", file, "--get", key])
+
+    return check_output([git_command, "config", "--get", key])
+
+
+def git_set_config(git_command: str, key: str, value: str, file: Optional[str] = None) -> str:
+    if file:
+        return check_output([git_command, "config", "--file", file, key, value])
+
+    return check_output([git_command, "config", key, value])
+
+
 def git_tag(git_command: str) -> Optional[str]:
     # Get current tag name.
     try:
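Note: the `file` argument to `git_get_config()`/`git_set_config()` is what lets the update scripts read keys straight out of `.gitmodules` rather than the repository config. The equivalent plain-git form (the submodule key here is illustrative):

```python
import subprocess

url = subprocess.check_output(
    ["git", "config", "--file", ".gitmodules",
     "--get", "submodule.release/scripts/addons.url"],
    text=True).strip()
print(url)  # e.g. "../blender-addons.git" for a relative submodule URL
```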
@@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
     if not "%GIT%" == "" (
         "%GIT%" submodule update --init --recursive --progress
         if errorlevel 1 goto FAIL
-        "%GIT%" submodule foreach git checkout master
+        "%GIT%" submodule foreach git checkout main
         if errorlevel 1 goto FAIL
-        "%GIT%" submodule foreach git pull --rebase origin master
+        "%GIT%" submodule foreach git pull --rebase origin main
         if errorlevel 1 goto FAIL
         goto EOF
     ) else (
@@ -4,9 +4,9 @@ if "%GIT%" == "" (
 )
 cd "%BLENDER_DIR%"
 for /f "delims=" %%i in ('"%GIT%" rev-parse HEAD') do echo Branch_hash=%%i
-cd "%BLENDER_DIR%/release/datafiles/locale"
+cd "%BLENDER_DIR%/locale"
 for /f "delims=" %%i in ('"%GIT%" rev-parse HEAD') do echo Locale_hash=%%i
-cd "%BLENDER_DIR%/release/scripts/addons"
+cd "%BLENDER_DIR%/scripts/addons"
 for /f "delims=" %%i in ('"%GIT%" rev-parse HEAD') do echo Addons_Hash=%%i
 cd "%BLENDER_DIR%"
 :EOF
@@ -38,7 +38,7 @@ PROJECT_NAME = Blender
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER = V3.5
+PROJECT_NUMBER = V3.6

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -37,7 +37,7 @@ def draw_callback_px(self, context):
     # BLF drawing routine
     font_id = font_info["font_id"]
     blf.position(font_id, 2, 80, 0)
-    blf.size(font_id, 50, 72)
+    blf.size(font_id, 50)
     blf.draw(font_id, "Hello World")

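Note: this template fix tracks the Python API change visible in the diff itself: `blf.size()` no longer takes a trailing DPI argument. A minimal sketch of the updated call (only meaningful inside Blender, where `blf` is available; font 0 is the default font):

```python
import blf

font_id = 0  # default font
blf.position(font_id, 2, 80, 0)
blf.size(font_id, 50)  # point size only; the old DPI argument is gone
blf.draw(font_id, "Hello World")
```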
@@ -31,7 +31,7 @@ For an overview of BMesh data types and how they reference each other see:
 Example Script
 --------------

-.. literalinclude:: __/__/__/release/scripts/templates_py/bmesh_simple.py
+.. literalinclude:: __/__/__/scripts/templates_py/bmesh_simple.py


 Standalone Module
@@ -288,7 +288,7 @@ In Python, this is done by defining a class, which is a subclass of an existing
 Example Operator
 ----------------

-.. literalinclude:: __/__/__/release/scripts/templates_py/operator_simple.py
+.. literalinclude:: __/__/__/scripts/templates_py/operator_simple.py

 Once this script runs, ``SimpleOperator`` is registered with Blender
 and can be called from Operator Search or added to the toolbar.
@@ -320,7 +320,7 @@ Example Panel
 Panels are registered as a class, like an operator.
 Notice the extra ``bl_`` variables used to set the context they display in.

-.. literalinclude:: __/__/__/release/scripts/templates_py/ui_panel_simple.py
+.. literalinclude:: __/__/__/scripts/templates_py/ui_panel_simple.py

 To run the script:

@@ -367,13 +367,13 @@ except ImportError:
 # Note that ".." is replaced by "__" in the RST files,
 # to avoid having to match Blender's source tree.
 EXTRA_SOURCE_FILES = (
-    "../../../release/scripts/templates_py/bmesh_simple.py",
-    "../../../release/scripts/templates_py/gizmo_operator.py",
-    "../../../release/scripts/templates_py/gizmo_operator_target.py",
-    "../../../release/scripts/templates_py/gizmo_simple.py",
-    "../../../release/scripts/templates_py/operator_simple.py",
-    "../../../release/scripts/templates_py/ui_panel_simple.py",
-    "../../../release/scripts/templates_py/ui_previews_custom_icon.py",
+    "../../../scripts/templates_py/bmesh_simple.py",
+    "../../../scripts/templates_py/gizmo_operator.py",
+    "../../../scripts/templates_py/gizmo_operator_target.py",
+    "../../../scripts/templates_py/gizmo_simple.py",
+    "../../../scripts/templates_py/operator_simple.py",
+    "../../../scripts/templates_py/ui_panel_simple.py",
+    "../../../scripts/templates_py/ui_previews_custom_icon.py",
     "../examples/bmesh.ops.1.py",
     "../examples/bpy.app.translations.py",
 )
@@ -476,7 +476,7 @@ MODULE_GROUPING = {

 # -------------------------------BLENDER----------------------------------------

-# converting bytes to strings, due to T30154
+# Converting bytes to strings, due to #30154.
 BLENDER_REVISION = str(bpy.app.build_hash, 'utf_8')
 BLENDER_REVISION_TIMESTAMP = bpy.app.build_commit_timestamp

@@ -487,7 +487,7 @@ BLENDER_VERSION_DOTS = "%d.%d" % (bpy.app.version[0], bpy.app.version[1])
 if BLENDER_REVISION != "Unknown":
     # SHA1 Git hash
     BLENDER_VERSION_HASH = BLENDER_REVISION
-    BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://developer.blender.org/rB%s>%s</a>" % (
+    BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://projects.blender.org/blender/blender/commit/%s>%s</a>" % (
         BLENDER_VERSION_HASH, BLENDER_VERSION_HASH,
     )
     BLENDER_VERSION_DATE = time.strftime("%d/%m/%Y", time.localtime(BLENDER_REVISION_TIMESTAMP))
@@ -647,7 +647,7 @@ def undocumented_message(module_name, type_name, identifier):
         module_name, type_name, identifier,
     )

-    return "Undocumented, consider `contributing <https://developer.blender.org/T51061>`__."
+    return "Undocumented, consider `contributing <https://developer.blender.org/>`__."


 def range_str(val):
@@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):

     # operators
     def write_ops():
-        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
-        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
-        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
+        API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/scripts"
+        API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
+        API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"

         op_modules = {}
         op = None
@@ -2200,7 +2200,7 @@ def write_rst_enum_items(basepath, key, key_no_prefix, enum_items):
     Write a single page for a static enum in RST.

     This helps avoiding very large lists being in-lined in many places which is an issue
-    especially with icons in ``bpy.types.UILayout``. See T87008.
+    especially with icons in ``bpy.types.UILayout``. See #87008.
     """
     filepath = os.path.join(basepath, "%s.rst" % key_no_prefix)
     with open(filepath, "w", encoding="utf-8") as fh:
@@ -156,7 +156,7 @@ var Popover = function() {
     },
     getNamed : function(v) {
         $.each(all_versions, function(ix, title) {
-            if (ix === "master" || ix === "latest") {
+            if (ix === "master" || ix === "main" || ix === "latest") {
                 var m = title.match(/\d\.\d[\w\d\.]*/)[0];
                 if (parseFloat(m) == v) {
                     v = ix;
extern/hipew/README.blender (vendored, 2 changes)
@@ -1,5 +1,5 @@
 Project: Blender
-URL: https://git.blender.org/blender.git
+URL: https://projects.blender.org/blender/blender.git
 License: Apache 2.0
 Upstream version: N/A
 Local modifications: None
@@ -12,6 +12,7 @@ from bpy.props import (
     PointerProperty,
     StringProperty,
 )
+from bpy.app.translations import pgettext_iface as iface_

 from math import pi

@@ -1664,30 +1665,51 @@ class CyclesPreferences(bpy.types.AddonPreferences):
             col.label(text="No compatible GPUs found for Cycles", icon='INFO')

             if device_type == 'CUDA':
-                col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
+                compute_capability = "3.0"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
             elif device_type == 'OPTIX':
-                col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
-                col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
+                compute_capability = "5.0"
+                driver_version = "470"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
+                col.label(text="and NVIDIA driver version %s or newer" % driver_version,
+                          icon='BLANK1', translate=False)
             elif device_type == 'HIP':
-                import sys
-                if sys.platform[:3] == "win":
-                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
-                elif sys.platform.startswith("linux"):
-                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
+                if True:
+                    col.label(text="HIP temporarily disabled due to compiler bugs", icon='BLANK1')
+                else:
+                    import sys
+                    if sys.platform[:3] == "win":
+                        driver_version = "21.Q4"
+                        col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
+                        col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
+                                  icon='BLANK1', translate=False)
+                    elif sys.platform.startswith("linux"):
+                        driver_version = "22.10"
+                        col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
+                        col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
+                                  translate=False)
             elif device_type == 'ONEAPI':
                 import sys
                 if sys.platform.startswith("win"):
+                    driver_version = "101.4032"
                     col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
-                    col.label(text="and Windows driver version 101.4032 or newer", icon='BLANK1')
+                    col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                 elif sys.platform.startswith("linux"):
+                    driver_version = "1.3.24931"
                     col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
-                    col.label(text=" - intel-level-zero-gpu version 1.3.24931 or newer", icon='BLANK1')
+                    col.label(text=iface_(" - intel-level-zero-gpu version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                 col.label(text=" - oneAPI Level-Zero Loader", icon='BLANK1')
             elif device_type == 'METAL':
-                col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
-                col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
+                silicon_mac_version = "12.2"
+                amd_mac_version = "12.3"
+                col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
+                          icon='BLANK1', translate=False)
+                col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
+                          translate=False)
             return

         for device in devices:
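Note: the pattern throughout this hunk is to pull version numbers out of the label text so each requirement becomes one stable, translatable template, while `translate=False` keeps the layout from translating the already-formatted result a second time. In isolation (a sketch; `col` is assumed to be a `bpy.types.UILayout` column):

```python
from bpy.app.translations import pgettext_iface as iface_

def draw_driver_requirement(col, driver_version: str) -> None:
    # One template for translators, regardless of future version bumps.
    col.label(text=iface_("and NVIDIA driver version %s or newer") % driver_version,
              icon='BLANK1', translate=False)
```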
@@ -1723,12 +1745,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
|||||||
|
|
||||||
if compute_device_type == 'METAL':
|
if compute_device_type == 'METAL':
|
||||||
import platform
|
import platform
|
||||||
# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
|
import re
|
||||||
# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
|
is_navi_2 = False
|
||||||
if platform.machine() == 'arm64':
|
for device in devices:
|
||||||
|
if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
|
||||||
|
is_navi_2 = True
|
||||||
|
break
|
||||||
|
|
||||||
|
# MetalRT only works on Apple Silicon and Navi2.
|
||||||
|
is_arm64 = platform.machine() == 'arm64'
|
||||||
|
if is_arm64 or is_navi_2:
|
||||||
col = layout.column()
|
col = layout.column()
|
||||||
col.use_property_split = True
|
col.use_property_split = True
|
||||||
col.prop(self, "kernel_optimization_level")
|
# Kernel specialization is only supported on Apple Silicon
|
||||||
|
if is_arm64:
|
||||||
|
col.prop(self, "kernel_optimization_level")
|
||||||
col.prop(self, "use_metalrt")
|
col.prop(self, "use_metalrt")
|
||||||
|
|
||||||
def draw(self, context):
|
def draw(self, context):
|
||||||
|
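Note: the new `re.search` gate recognizes Navi 2 Mac GPUs by their marketing names. Checking the pattern against a few illustrative device strings:

```python
import re

NAVI_2 = r"((RX)|(Pro)|(PRO))\s+W?6\d00X"

for name in ("AMD Radeon Pro W6800X", "AMD Radeon PRO W6900X", "Apple M1 Max"):
    print(name, "->", bool(re.search(NAVI_2, name)))
# AMD Radeon Pro W6800X -> True
# AMD Radeon PRO W6900X -> True
# Apple M1 Max -> False
```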
@@ -20,7 +20,7 @@ class CyclesPresetPanel(PresetPanel, Panel):
     @staticmethod
     def post_cb(context):
         # Modify an arbitrary built-in scene property to force a depsgraph
-        # update, because add-on properties don't. (see T62325)
+        # update, because add-on properties don't. (see #62325)
         render = context.scene.render
         render.filter_size = render.filter_size

@@ -105,11 +105,12 @@ GPUShader *BlenderFallbackDisplayShader::bind(int width, int height)

   /* Bind shader now to enable uniform assignment. */
   GPU_shader_bind(shader_program_);
-  GPU_shader_uniform_int(shader_program_, image_texture_location_, 0);
+  int slot = 0;
+  GPU_shader_uniform_int_ex(shader_program_, image_texture_location_, 1, 1, &slot);
   float size[2];
   size[0] = width;
   size[1] = height;
-  GPU_shader_uniform_vector(shader_program_, fullscreen_location_, 2, 1, size);
+  GPU_shader_uniform_float_ex(shader_program_, fullscreen_location_, 2, 1, size);
   return shader_program_;
 }

@@ -20,7 +20,7 @@ BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
     : b_image(b_image),
       frame(frame),
       tile_number(tile_number),
-      /* Don't free cache for preview render to avoid race condition from T93560, to be fixed
+      /* Don't free cache for preview render to avoid race condition from #93560, to be fixed
        * properly later as we are close to release. */
       free_cache(!is_preview_render && !b_image.has_data())
 {
@@ -72,7 +72,7 @@ bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaDat
     metadata.colorspace = u_colorspace_raw;
   }
   else {
-    /* In some cases (e.g. T94135), the colorspace setting in Blender gets updated as part of the
+    /* In some cases (e.g. #94135), the colorspace setting in Blender gets updated as part of the
      * metadata queries in this function, so update the colorspace setting here. */
     PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
     metadata.colorspace = get_enum_identifier(colorspace_ptr, "name");
@@ -24,7 +24,7 @@ void BlenderSync::sync_light(BL::Object &b_parent,
   Light *light = light_map.find(key);

   /* Check if the transform was modified, in case a linked collection is moved we do not get a
-   * specific depsgraph update (T88515). This also mimics the behavior for Objects. */
+   * specific depsgraph update (#88515). This also mimics the behavior for Objects. */
   const bool tfm_updated = (light && light->get_tfm() != tfm);

   /* Update if either object or light data changed. */
@@ -94,7 +94,7 @@ void python_thread_state_restore(void **python_thread_state)
   *python_thread_state = NULL;
 }

-static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
+static const char *PyC_UnicodeAsBytes(PyObject *py_str, PyObject **coerce)
 {
   const char *result = PyUnicode_AsUTF8(py_str);
   if (result) {
@@ -131,8 +131,8 @@ static PyObject *init_func(PyObject * /*self*/, PyObject *args)
   }

   PyObject *path_coerce = nullptr, *user_path_coerce = nullptr;
-  path_init(PyC_UnicodeAsByte(path, &path_coerce),
-            PyC_UnicodeAsByte(user_path, &user_path_coerce));
+  path_init(PyC_UnicodeAsBytes(path, &path_coerce),
+            PyC_UnicodeAsBytes(user_path, &user_path_coerce));
   Py_XDECREF(path_coerce);
   Py_XDECREF(user_path_coerce);

@@ -404,7 +404,7 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
    * point we know that we've got everything to render current view layer.
    */
   /* At the moment we only free if we are not doing multi-view
-   * (or if we are rendering the last view). See T58142/D4239 for discussion.
+   * (or if we are rendering the last view). See #58142/D4239 for discussion.
    */
   if (view_index == num_views - 1) {
     free_blender_memory_if_possible();
@@ -349,8 +349,7 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)

   bool use_light_tree = get_boolean(cscene, "use_light_tree");
   integrator->set_use_light_tree(use_light_tree);
-  integrator->set_light_sampling_threshold(
-      (use_light_tree) ? 0.0f : get_float(cscene, "light_sampling_threshold"));
+  integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold"));

   if (integrator->use_light_tree_is_modified()) {
     scene->light_manager->tag_update(scene, LightManager::UPDATE_ALL);
@@ -766,7 +765,7 @@ void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
       (BlenderSession::headless || is_interface_locked) &&
       /* Baking re-uses the depsgraph multiple times, clearing crashes
        * reading un-evaluated mesh data which isn't aligned with the
-       * geometry we're baking, see T71012. */
+       * geometry we're baking, see #71012. */
       !scene->bake_manager->get_baking() &&
      /* Persistent data must main caches for performance and correctness. */
      !is_persistent_data;
@@ -42,12 +42,15 @@ endif()
 ###########################################################################

 if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
-  find_package(HIP)
-  set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
+  set(WITH_CYCLES_HIP_BINARIES OFF)
+  message(STATUS "HIP temporarily disabled due to compiler bugs")

-  if(HIP_FOUND)
-    message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
-  endif()
+  # find_package(HIP)
+  # set_and_warn_library_found("HIP compiler" HIP_FOUND WITH_CYCLES_HIP_BINARIES)
+  # if(HIP_FOUND)
+  #   message(STATUS "Found HIP ${HIP_HIPCC_EXECUTABLE} (${HIP_VERSION})")
+  # endif()
 endif()

 if(NOT WITH_HIP_DYNLOAD)
@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
 }

 CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
+  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
+
   first_error = true;

   cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

   need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
   pitch_alignment = 0;

   /* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
   /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
    * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
    * so we can predict which memory to map to host. */
-  cuda_assert(
-      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  int value;
+  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  can_map_host = value != 0;

   cuda_assert(cuDeviceGetAttribute(
       &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
```diff
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
 # endif
 }
 
-void CUDADevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void CUDADevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
-      device_memory &mem = *pair.first;
-      CUDAMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple CUDA devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
 {
   CUDAContextScope scope(this);
 
-  CUdeviceptr device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
   cuMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    cuMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = cuMemAlloc(&device_pointer, size);
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = CUDA_SUCCESS;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = cuMemHostAlloc(
-          &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
-      assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
-             (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != CUDA_SUCCESS) {
-    if (mem.type == MEM_DEVICE_ONLY) {
-      status = " failed, out of device memory";
-      set_error("System is out of GPU memory");
-    }
-    else {
-      status = " failed, out of device and host memory";
-      set_error("System is out of GPU and shared host memory");
-    }
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  CUDAMem *cmem = &cuda_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * CUDA memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }
 
-void CUDADevice::generic_copy_to(device_memory &mem)
+bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  CUDAContextScope scope(this);
 
-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const CUDAContextScope scope(this);
-    cuda_assert(
-        cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
+  return mem_alloc_result == CUDA_SUCCESS;
 }
 
-void CUDADevice::generic_free(device_memory &mem)
+void CUDADevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+  CUDAContextScope scope(this);
 
-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
+}
 
-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          cuMemFreeHost(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      cuda_assert(cuMemFree(mem.device_pointer));
-    }
+bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  CUDAContextScope scope(this);
 
-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  CUresult mem_alloc_result = cuMemHostAlloc(
+      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+  return mem_alloc_result == CUDA_SUCCESS;
+}
 
-    cuda_mem_map.erase(cuda_mem_map.find(&mem));
-  }
+void CUDADevice::free_host(void *shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuMemFreeHost(shared_pointer);
+}
+
+bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
+  return true;
+}
+
+void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const CUDAContextScope scope(this);
+
+  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
 }
 
 void CUDADevice::mem_alloc(device_memory &mem)
```
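With the shared policy hoisted into the base class, each remaining CUDA override reduces to a single driver-API call. A standalone sketch of those calls in isolation, assuming a CUDA toolkit with the driver API available (the `check` helper is hypothetical, error handling trimmed to the minimum):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda.h>

static void check(CUresult res, const char *what)
{
  if (res != CUDA_SUCCESS) {
    std::fprintf(stderr, "%s failed (%d)\n", what, (int)res);
    std::exit(1);
  }
}

int main()
{
  CUdevice dev;
  CUcontext ctx;
  check(cuInit(0), "cuInit");
  check(cuDeviceGet(&dev, 0), "cuDeviceGet");
  check(cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev), "cuCtxCreate");

  /* alloc_device / free_device */
  CUdeviceptr dptr = 0;
  check(cuMemAlloc(&dptr, 1024), "cuMemAlloc");
  check(cuMemFree(dptr), "cuMemFree");

  /* alloc_host / transform_host_pointer / free_host */
  void *hptr = nullptr;
  check(cuMemHostAlloc(&hptr, 1024, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED),
        "cuMemHostAlloc");
  CUdeviceptr mapped = 0;
  check(cuMemHostGetDevicePointer_v2(&mapped, hptr, 0), "cuMemHostGetDevicePointer_v2");
  check(cuMemFreeHost(hptr), "cuMemFreeHost");

  check(cuCtxDestroy(ctx), "cuCtxDestroy");
  return 0;
}
```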
```diff
@@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)
 
   /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
    * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
     const CUDAContextScope scope(this);
     cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
   }
@@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
     return;
   }
 
-  CUDAMem *cmem = NULL;
+  Mem *cmem = NULL;
   CUarray array_3d = NULL;
   size_t src_pitch = mem.data_width * dsize * mem.data_elements;
   size_t dst_pitch = src_pitch;
 
   if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
     cmem->texobject = 0;
 
     if (mem.data_depth > 1) {
       array_3d = (CUarray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
     }
     else if (mem.data_height > 0) {
       dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
     mem.device_size = size;
     stats.mem_alloc(size);
 
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
     cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
   }
   else if (mem.data_height > 0) {
     /* 2D texture, using pitch aligned linear memory. */
@@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
   texDesc.filterMode = filter_mode;
   texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
 
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  cmem = &cuda_mem_map[&mem];
+  thread_scoped_lock lock(device_mem_map_mutex);
+  cmem = &device_mem_map[&mem];
 
   cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
 
@@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
 {
   if (mem.device_pointer) {
     CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];
 
     if (cmem.texobject) {
       /* Free bindless texture. */
@@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)
 
     if (!mem.is_resident(this)) {
       /* Do not free memory here, since it was allocated on a different device. */
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
     }
     else if (cmem.array) {
       /* Free array. */
-      cuArrayDestroy(cmem.array);
+      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
       stats.mem_free(mem.device_size);
       mem.device_pointer = 0;
       mem.device_size = 0;
 
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
     }
     else {
       lock.unlock();
```
```diff
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
 
 class DeviceQueue;
 
-class CUDADevice : public Device {
+class CUDADevice : public GPUDevice {
 
   friend class CUDAContextScope;
 
@@ -29,36 +29,11 @@ class CUDADevice : public Device {
   CUdevice cuDevice;
   CUcontext cuContext;
   CUmodule cuModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
   int pitch_alignment;
   int cuDevId;
   int cuDevArchitecture;
   bool first_error;
 
-  struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    CUtexObject texobject;
-    CUarray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, CUDAMem> CUDAMemMap;
-  CUDAMemMap cuda_mem_map;
-  thread_mutex cuda_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
   CUDADeviceKernels kernels;
 
   static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {
 
   void reserve_local_memory(const uint kernel_features);
 
-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
 
   void mem_alloc(device_memory &mem) override;
```
```diff
@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
   return nullptr;
 }
 
+GPUDevice::~GPUDevice() noexcept(false)
+{
+}
+
+bool GPUDevice::load_texture_info()
+{
+  if (need_texture_info) {
+    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+    need_texture_info = false;
+    texture_info.copy_to_device();
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+
+void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
+                                 size_t preferred_working_headroom)
+{
+  /* Limit amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system
+   * memory free, whichever is smaller. */
+  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  size_t system_ram = system_physical_ram();
+
+  if (system_ram > 0) {
+    if (system_ram / 2 > default_limit) {
+      map_host_limit = system_ram - default_limit;
+    }
+    else {
+      map_host_limit = system_ram / 2;
+    }
+  }
+  else {
+    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations respectively. We set the working
+   * memory limit headroom lower than the working one so there
+   * is space left for it. */
+  device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
+                                                             32 * 1024 * 1024LL;  // 32MB
+  device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
+                                                             128 * 1024 * 1024LL;  // 128MB
+
+  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    thread_scoped_lock lock(device_mem_map_mutex);
+    foreach (MemMap::value_type &pair, device_mem_map) {
+      device_memory &mem = *pair.first;
+      Mem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
+      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+                        (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* For other textures, only move image textures. */
+      if (for_texture && !is_image) {
+        continue;
+      }
+
+      /* Try to move largest allocation, prefer moving images. */
+      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+        max_is_image = is_image;
+        max_size = mem.device_size;
+        max_mem = &mem;
+      }
+    }
+    lock.unlock();
+
+    /* Move to host memory. This part is mutex protected since
+     * multiple backend devices could be moving the memory. The
+     * first one will do it, and the rest will adopt the pointer. */
+    if (max_mem) {
+      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
+
+      static thread_mutex move_mutex;
+      thread_scoped_lock lock(move_mutex);
+
+      any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another devices would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well. */
+      max_mem->device_copy_to();
+      size = (max_size >= size) ? 0 : size - max_size;
+
+      any_device_moving_textures_to_host = false;
+    }
+    else {
+      break;
+    }
+  }
+
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
+  /* Update texture info array with new pointers. */
+  load_texture_info();
+}
+
+GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+  void *device_pointer = 0;
+  size_t size = mem.memory_size() + pitch_padding;
+
+  bool mem_alloc_result = false;
+  const char *status = "";
+
+  /* First try allocating in device memory, respecting headroom. We make
+   * an exception for texture info. It is small and frequently accessed,
+   * so treat it as working memory.
+   *
+   * If there is not enough room for working memory, we will try to move
+   * textures to host memory, assuming the performance impact would have
+   * been worse for working memory. */
+  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+  bool is_image = is_texture && (mem.data_height > 1);
+
+  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+  size_t total = 0, free = 0;
+  get_device_memory_info(total, free);
+
+  /* Move textures to host memory if needed. */
+  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+    move_textures_to_host(size + headroom - free, is_texture);
+    get_device_memory_info(total, free);
+  }
+
+  /* Allocate in device memory. */
+  if (!move_texture_to_host && (size + headroom) < free) {
+    mem_alloc_result = alloc_device(device_pointer, size);
+    if (mem_alloc_result) {
+      device_mem_in_use += size;
+      status = " in device memory";
+    }
+  }
+
+  /* Fall back to mapped host memory if needed and possible. */
+
+  void *shared_pointer = 0;
+
+  if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
+    if (mem.shared_pointer) {
+      /* Another device already allocated host memory. */
+      mem_alloc_result = true;
+      shared_pointer = mem.shared_pointer;
+    }
+    else if (map_host_used + size < map_host_limit) {
+      /* Allocate host memory ourselves. */
+      mem_alloc_result = alloc_host(shared_pointer, size);
+
+      assert((mem_alloc_result && shared_pointer != 0) ||
+             (!mem_alloc_result && shared_pointer == 0));
+    }
+
+    if (mem_alloc_result) {
+      assert(transform_host_pointer(device_pointer, shared_pointer));
+      map_host_used += size;
+      status = " in host memory";
+    }
+  }
+
+  if (!mem_alloc_result) {
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
+  }
+
+  if (mem.name) {
+    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")" << status;
+  }
+
+  mem.device_pointer = (device_ptr)device_pointer;
+  mem.device_size = size;
+  stats.mem_alloc(size);
+
+  if (!mem.device_pointer) {
+    return NULL;
+  }
+
+  /* Insert into map of allocations. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  Mem *cmem = &device_mem_map[&mem];
+  if (shared_pointer != 0) {
+    /* Replace host pointer with our host allocation. Only works if
+     * memory layout is the same and has no pitch padding. Also
+     * does not work if we move textures to host during a render,
+     * since other devices might be using the memory. */
+
+    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+        mem.host_pointer != shared_pointer) {
+      memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A Call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device */
+
+      mem.host_free();
+      mem.host_pointer = shared_pointer;
+    }
+    mem.shared_pointer = shared_pointer;
+    mem.shared_counter++;
+    cmem->use_mapped_host = true;
+  }
+  else {
+    cmem->use_mapped_host = false;
+  }
+
+  return cmem;
+}
+
+void GPUDevice::generic_free(device_memory &mem)
+{
+  if (mem.device_pointer) {
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];
+
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free a mapped host memory. */
+
+    if (cmem.use_mapped_host) {
+      assert(mem.shared_pointer);
+      if (mem.shared_pointer) {
+        assert(mem.shared_counter > 0);
+        if (--mem.shared_counter == 0) {
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          free_host(mem.shared_pointer);
+          mem.shared_pointer = 0;
+        }
+      }
+      map_host_used -= mem.device_size;
+    }
+    else {
+      /* Free device memory. */
+      free_device((void *)mem.device_pointer);
+      device_mem_in_use -= mem.device_size;
+    }
+
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+}
+
+void GPUDevice::generic_copy_to(device_memory &mem)
+{
+  if (!mem.host_pointer || !mem.device_pointer) {
+    return;
+  }
+
+  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+   * backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
+   * copy data from mem.host_pointer. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+    copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
+  }
+}
+
 /* DeviceInfo */
 
 CCL_NAMESPACE_END
```
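The allocation policy in `GPUDevice::generic_alloc` is easiest to follow with concrete numbers: device memory is used only while `size + headroom < free`; otherwise textures are moved to the host to make room first. A self-contained illustration of that comparison (the figures are invented, only the arithmetic mirrors the code above):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
  const std::size_t MB = 1024 * 1024;
  std::size_t free_mem = 200 * MB; /* as reported by get_device_memory_info() */
  std::size_t request = 150 * MB;  /* size of the incoming texture allocation */
  std::size_t headroom = 128 * MB; /* device_texture_headroom default */

  if (request + headroom >= free_mem) {
    /* Same condition that triggers move_textures_to_host() above. */
    std::size_t to_move = request + headroom - free_mem;
    std::printf("need to move %zu MB of textures to host first\n", to_move / MB);
  }
  else {
    std::printf("fits in device memory\n");
  }
  return 0;
}
```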
```diff
@@ -309,6 +309,93 @@ class Device {
   static uint devices_initialized_mask;
 };
 
+/* Device, which is GPU, with some common functionality for GPU backends */
+class GPUDevice : public Device {
+ protected:
+  GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+      : Device(info_, stats_, profiler_),
+        texture_info(this, "texture_info", MEM_GLOBAL),
+        need_texture_info(false),
+        can_map_host(false),
+        map_host_used(0),
+        map_host_limit(0),
+        device_texture_headroom(0),
+        device_working_headroom(0),
+        device_mem_map(),
+        device_mem_map_mutex(),
+        move_texture_to_host(false),
+        device_mem_in_use(0)
+  {
+  }
+
+ public:
+  virtual ~GPUDevice() noexcept(false);
+
+  /* For GPUs that can use bindless textures in some way or another. */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+  /* Returns true if the texture info was copied to the device (meaning, some more
+   * re-initialization might be needed). */
+  virtual bool load_texture_info();
+
+ protected:
+  /* Memory allocation, only accessed through device_memory. */
+  friend class device_memory;
+
+  bool can_map_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  typedef unsigned long long texMemObject;
+  typedef unsigned long long arrayMemObject;
+  struct Mem {
+    Mem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    texMemObject texobject;
+    arrayMemObject array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, Mem> MemMap;
+  MemMap device_mem_map;
+  thread_mutex device_mem_map_mutex;
+  bool move_texture_to_host;
+  /* Simple counter which will try to track amount of used device memory */
+  size_t device_mem_in_use;
+
+  virtual void init_host_memory(size_t preferred_texture_headroom = 0,
+                                size_t preferred_working_headroom = 0);
+  virtual void move_textures_to_host(size_t size, bool for_texture);
+
+  /* Allocation, deallocation and copy functions, with corresponding
+   * support of device/host allocations. */
+  virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+  virtual void generic_free(device_memory &mem);
+  virtual void generic_copy_to(device_memory &mem);
+
+  /* total - amount of device memory, free - amount of available device memory */
+  virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
+
+  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
+
+  virtual void free_device(void *device_pointer) = 0;
+
+  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
+
+  virtual void free_host(void *shared_pointer) = 0;
+
+  /* This function should return device pointer corresponding to shared pointer, which
+   * is host buffer, allocated in `alloc_host`. The function should return `true`, if such
+   * address transformation is possible and `false` otherwise. */
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
+
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
+};
+
 CCL_NAMESPACE_END
 
 #endif /* __DEVICE_H__ */
```
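To make the shape of the new interface concrete, here is a hypothetical minimal backend written against a reduced model of it, using plain `malloc` in place of a real GPU runtime. `GPUDeviceModel` and `SketchDevice` do not exist in the patch; they only show which hooks a backend supplies and where the shared policy lives:

```cpp
#include <cstdio>
#include <cstdlib>

/* Reduced model of the GPUDevice split: the base class owns the policy,
 * subclasses supply raw device allocation. */
class GPUDeviceModel {
 public:
  virtual ~GPUDeviceModel() = default;

  /* Shared policy, like GPUDevice::generic_alloc(). */
  void *generic_alloc(std::size_t size)
  {
    void *ptr = nullptr;
    if (!alloc_device(ptr, size)) {
      std::fprintf(stderr, "out of device memory\n");
    }
    return ptr;
  }

 protected:
  /* Narrow backend interface, mirroring the new pure virtuals. */
  virtual bool alloc_device(void *&device_pointer, std::size_t size) = 0;
  virtual void free_device(void *device_pointer) = 0;
};

/* Hypothetical backend: malloc stands in for cuMemAlloc/hipMalloc. */
class SketchDevice : public GPUDeviceModel {
 protected:
  bool alloc_device(void *&device_pointer, std::size_t size) override
  {
    device_pointer = std::malloc(size);
    return device_pointer != nullptr;
  }
  void free_device(void *device_pointer) override
  {
    std::free(device_pointer);
  }
};

int main()
{
  SketchDevice device;
  void *p = device.generic_alloc(1024);
  std::free(p); /* freed directly here; real code would go through generic_free */
  return 0;
}
```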
```diff
@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
 }
 
 HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
+  static_assert(sizeof(arrayMemObject) == sizeof(hArray));
+
   first_error = true;
 
   hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
 
   need_texture_info = false;
 
-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
   pitch_alignment = 0;
 
   /* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
   /* hipDeviceMapHost for mapping host memory when out of device memory.
    * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
    * so we can predict which memory to map to host. */
-  hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  int value;
+  hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  can_map_host = value != 0;
 
   hip_assert(
       hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
```
```diff
@@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
 # endif
 }
 
-void HIPDevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void HIPDevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    foreach (HIPMemMap::value_type &pair, hip_mem_map) {
-      device_memory &mem = *pair.first;
-      HIPMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple HIP devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
 {
   HIPContextScope scope(this);
 
-  hipDeviceptr_t device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  hipError_t mem_alloc_result = hipErrorOutOfMemory;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
   hipMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    hipMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = hipMalloc(&device_pointer, size);
-    if (mem_alloc_result == hipSuccess) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != hipSuccess && can_map_host) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = hipSuccess;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = hipHostMalloc(
-          &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
-
-      assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
-             (mem_alloc_result != hipSuccess && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == hipSuccess) {
-      hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != hipSuccess) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  HIPMem *cmem = &hip_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * HIP memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }
 
-void HIPDevice::generic_copy_to(device_memory &mem)
+bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  HIPContextScope scope(this);
 
-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const HIPContextScope scope(this);
-    hip_assert(
-        hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
+  return mem_alloc_result == hipSuccess;
 }
 
-void HIPDevice::generic_free(device_memory &mem)
+void HIPDevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+  HIPContextScope scope(this);
 
-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  hip_assert(hipFree((hipDeviceptr_t)device_pointer));
+}
 
-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          hipHostFree(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      hip_assert(hipFree(mem.device_pointer));
-    }
+bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  HIPContextScope scope(this);
 
-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  hipError_t mem_alloc_result = hipHostMalloc(
+      &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
 
-    hip_mem_map.erase(hip_mem_map.find(&mem));
-  }
+  return mem_alloc_result == hipSuccess;
 }
+
+void HIPDevice::free_host(void *shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hipHostFree(shared_pointer);
+}
+
+bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
+  return true;
+}
+
+void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const HIPContextScope scope(this);
+
+  hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
+}
 
 void HIPDevice::mem_alloc(device_memory &mem)
```
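The HIP hooks mirror the CUDA ones call for call. A standalone sketch of the underlying HIP runtime calls, assuming a ROCm/HIP toolchain (the `check` helper is hypothetical, error handling kept minimal):

```cpp
#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>

static void check(hipError_t res, const char *what)
{
  if (res != hipSuccess) {
    std::fprintf(stderr, "%s failed: %s\n", what, hipGetErrorString(res));
    std::exit(1);
  }
}

int main()
{
  /* alloc_device / free_device */
  void *dptr = nullptr;
  check(hipMalloc(&dptr, 1024), "hipMalloc");
  check(hipFree(dptr), "hipFree");

  /* alloc_host / transform_host_pointer / free_host */
  void *hptr = nullptr;
  check(hipHostMalloc(&hptr, 1024, hipHostMallocMapped | hipHostMallocWriteCombined),
        "hipHostMalloc");
  void *mapped = nullptr;
  check(hipHostGetDevicePointer(&mapped, hptr, 0), "hipHostGetDevicePointer");
  check(hipHostFree(hptr), "hipHostFree");
  return 0;
}
```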
@@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)

   /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
    * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
     const HIPContextScope scope(this);
     hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
   }
@@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
     return;
   }

-  HIPMem *cmem = NULL;
+  Mem *cmem = NULL;
   hArray array_3d = NULL;
   size_t src_pitch = mem.data_width * dsize * mem.data_elements;
   size_t dst_pitch = src_pitch;

   if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
     cmem->texobject = 0;

     if (mem.data_depth > 1) {
       array_3d = (hArray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
     }
     else if (mem.data_height > 0) {
       dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
     mem.device_size = size;
     stats.mem_alloc(size);

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
     cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
   }
   else if (mem.data_height > 0) {
     /* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
   texDesc.filterMode = filter_mode;
   texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;

-  thread_scoped_lock lock(hip_mem_map_mutex);
-  cmem = &hip_mem_map[&mem];
+  thread_scoped_lock lock(device_mem_map_mutex);
+  cmem = &device_mem_map[&mem];

   hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
 {
   if (mem.device_pointer) {
     HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

     if (cmem.texobject) {
       /* Free bindless texture. */
@@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)

     if (!mem.is_resident(this)) {
       /* Do not free memory here, since it was allocated on a different device. */
-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
     }
     else if (cmem.array) {
       /* Free array. */
-      hipArrayDestroy(cmem.array);
+      hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
       stats.mem_free(mem.device_size);
       mem.device_pointer = 0;
       mem.device_size = 0;

-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
     }
     else {
       lock.unlock();
@@ -1153,7 +906,7 @@ bool HIPDevice::should_use_graphics_interop()
    * possible, but from the empiric measurements it can be considerably slower than using naive
    * pixels copy. */

-  /* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
+  /* Disable graphics interop for now, because of driver bug in 21.40. See #92972 */
 # if 0
   HIPContextScope scope(this);
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class HIPDevice : public Device {
+class HIPDevice : public GPUDevice {

   friend class HIPContextScope;

@@ -26,36 +26,11 @@ class HIPDevice : public Device {
   hipDevice_t hipDevice;
   hipCtx_t hipContext;
   hipModule_t hipModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
   int pitch_alignment;
   int hipDevId;
   int hipDevArchitecture;
   bool first_error;

-  struct HIPMem {
-    HIPMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    hipTextureObject_t texobject;
-    hArray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, HIPMem> HIPMemMap;
-  HIPMemMap hip_mem_map;
-  thread_mutex hip_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
   HIPDeviceKernels kernels;

   static bool have_precompiled_kernels();

@@ -81,17 +56,13 @@ class HIPDevice : public Device {
   virtual bool load_kernels(const uint kernel_features) override;
   void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

   void mem_alloc(device_memory &mem) override;
@@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
   hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
   hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

-  return (major >= 10);
+  return (major >= 9);
 }

 CCL_NAMESPACE_END
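The relaxed check above (major >= 9 rather than major >= 10) admits AMD Vega-class (gfx9xx) GPUs, which report a compute-capability major version of 9 through hipDeviceAttributeComputeCapabilityMajor; previously only gfx10xx (RDNA) and newer devices passed.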
@@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
       return "integrator_terminated_paths_array";
     case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
       return "integrator_sorted_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
+      return "integrator_sort_bucket_pass";
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
+      return "integrator_sort_write_pass";
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
       return "integrator_compact_paths_array";
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
@@ -247,6 +247,8 @@ class device_memory {
   bool is_resident(Device *sub_device) const;

 protected:
+  friend class Device;
+  friend class GPUDevice;
   friend class CUDADevice;
   friend class OptiXDevice;
   friend class HIPDevice;
@@ -21,6 +21,7 @@ class BVHMetal : public BVH {

   API_AVAILABLE(macos(11.0))
   vector<id<MTLAccelerationStructure>> blas_array;
+  vector<uint32_t> blas_lookup;

   bool motion_blur = false;

@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,

   uint32_t instance_index = 0;
   uint32_t motion_transform_index = 0;

+  // allocate look up buffer for worst case scenario
+  uint64_t count = objects.size();
+  blas_lookup.resize(count);
+
   for (Object *ob : objects) {
     /* Skip non-traceable objects */
     if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
     /* Set user instance ID to object index */
     int object_index = ob->get_device_index();
     uint32_t user_id = uint32_t(object_index);
+    int currIndex = instance_index++;
+    assert(user_id < blas_lookup.size());
+    blas_lookup[user_id] = accel_struct_index;

     /* Bake into the appropriate descriptor */
     if (motion_blur) {
       MTLAccelerationStructureMotionInstanceDescriptor *instances =
           (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
-      MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
+      MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];

       desc.accelerationStructureIndex = accel_struct_index;
       desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
     else {
       MTLAccelerationStructureUserIDInstanceDescriptor *instances =
           (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
-      MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
+      MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];

       desc.accelerationStructureIndex = accel_struct_index;
       desc.userID = user_id;
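The blas_lookup table filled above is a dense map from object index (the userID baked into each TLAS instance) to the index of the BLAS that instance references, so shading code that only knows the hit instance's user ID can find the matching entry in blas_array. A CPU-side illustration of the same mapping, with hypothetical stand-in names rather than Cycles API:

#include <cassert>
#include <cstdint>
#include <vector>

struct Instance {
  uint32_t user_id;            /* object index baked into the TLAS instance */
  uint32_t accel_struct_index; /* which BLAS this instance references */
};

int main()
{
  const std::vector<Instance> instances = {{0, 2}, {1, 0}, {2, 2}};

  /* Build the lookup the way build_TLAS does: worst case, one slot per object. */
  std::vector<uint32_t> blas_lookup(instances.size());
  for (const Instance &inst : instances) {
    assert(inst.user_id < blas_lookup.size());
    blas_lookup[inst.user_id] = inst.accel_struct_index;
  }

  /* A hit that only knows its user ID can now recover the BLAS index. */
  const uint32_t blas_index = blas_lookup[2];
  assert(blas_index == 2);
  return 0;
}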
@@ -55,6 +55,9 @@ void device_metal_info(vector<DeviceInfo> &devices)
     info.denoisers = DENOISER_NONE;
     info.id = id;

+    info.has_nanovdb = MetalInfo::get_device_vendor(device) == METAL_GPU_APPLE;
+    info.has_light_tree = MetalInfo::get_device_vendor(device) != METAL_GPU_AMD;
+
     devices.push_back(info);
     device_index++;
   }
@@ -67,13 +67,21 @@ class MetalDevice : public Device {
   std::recursive_mutex metal_mem_map_mutex;

   /* Bindless Textures */
+  bool is_texture(const TextureInfo &tex);
   device_vector<TextureInfo> texture_info;
   bool need_texture_info;
   id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
+  id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
+  id<MTLBuffer> buffer_bindings_1d = nil;
   id<MTLBuffer> texture_bindings_2d = nil;
   id<MTLBuffer> texture_bindings_3d = nil;
   std::vector<id<MTLTexture>> texture_slot_map;

+  /* BLAS encoding & lookup */
+  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+  id<MTLBuffer> blas_buffer = nil;
+  id<MTLBuffer> blas_lookup_buffer = nil;
+
   bool use_metalrt = false;
   MetalPipelineType kernel_specialization_level = PSO_GENERIC;

@@ -105,6 +113,8 @@ class MetalDevice : public Device {

   bool use_adaptive_compilation();

+  bool use_local_atomic_sort() const;
+
   bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);

   void make_source(MetalPipelineType pso_type, const uint kernel_features);
@@ -91,11 +91,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     }
   }

-  texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
-  texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
-
-  stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
-
   switch (device_vendor) {
     default:
       break;
@@ -105,6 +100,7 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     }
     case METAL_GPU_AMD: {
       max_threads_per_threadgroup = 128;
+      use_metalrt = info.use_metalrt;
       break;
     }
     case METAL_GPU_APPLE: {
@@ -155,6 +151,16 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
   arg_desc_texture.dataType = MTLDataTypeTexture;
   arg_desc_texture.access = MTLArgumentAccessReadOnly;
   mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
+  MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
+  arg_desc_buffer.dataType = MTLDataTypePointer;
+  arg_desc_buffer.access = MTLArgumentAccessReadOnly;
+  mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
+
+  buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
+  texture_bindings_2d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
+  texture_bindings_3d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
+  stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
+                  texture_bindings_3d.allocatedSize);

   /* command queue for non-tracing work on the GPU */
   mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
@@ -179,6 +185,8 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
     arg_desc_tex.dataType = MTLDataTypePointer;
     arg_desc_tex.access = MTLArgumentAccessReadOnly;

+    arg_desc_tex.index = index++;
+    [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
     arg_desc_tex.index = index++;
     [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
     arg_desc_tex.index = index++;
@@ -192,6 +200,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
       arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
       arg_desc_as.access = MTLArgumentAccessReadOnly;

+      MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
+      arg_desc_ptrs.dataType = MTLDataTypePointer;
+      arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
+
       MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
       arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
       arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +216,32 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
       [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
       arg_desc_ift.index = index++;
       [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
+      arg_desc_ift.index = index++;
+      [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
+      arg_desc_ptrs.index = index++;
+      [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
+      arg_desc_ptrs.index = index++;
+      [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */

       [arg_desc_ift release];
       [arg_desc_as release];
+      [arg_desc_ptrs release];
     }
   }

   mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

+  // preparing the blas arg encoder
+  if (@available(macos 11.0, *)) {
+    if (use_metalrt) {
+      MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
+      arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
+      arg_desc_blas.access = MTLArgumentAccessReadOnly;
+      mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
+      [arg_desc_blas release];
+    }
+  }
+
   for (int i = 0; i < ancillary_desc.count; i++) {
     [ancillary_desc[i] release];
   }
|
|||||||
* existing_devices_mutex). */
|
* existing_devices_mutex). */
|
||||||
thread_scoped_lock lock(existing_devices_mutex);
|
thread_scoped_lock lock(existing_devices_mutex);
|
||||||
|
|
||||||
for (auto &tex : texture_slot_map) {
|
int num_resources = texture_info.size();
|
||||||
if (tex) {
|
for (int res = 0; res < num_resources; res++) {
|
||||||
[tex release];
|
if (is_texture(texture_info[res])) {
|
||||||
tex = nil;
|
[texture_slot_map[res] release];
|
||||||
|
texture_slot_map[res] = nil;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
flush_delayed_free_list();
|
flush_delayed_free_list();
|
||||||
|
|
||||||
if (texture_bindings_2d) {
|
if (texture_bindings_2d) {
|
||||||
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||||
|
texture_bindings_3d.allocatedSize);
|
||||||
|
[buffer_bindings_1d release];
|
||||||
[texture_bindings_2d release];
|
[texture_bindings_2d release];
|
||||||
[texture_bindings_3d release];
|
[texture_bindings_3d release];
|
||||||
}
|
}
|
||||||
[mtlTextureArgEncoder release];
|
[mtlTextureArgEncoder release];
|
||||||
[mtlBufferKernelParamsEncoder release];
|
[mtlBufferKernelParamsEncoder release];
|
||||||
|
[mtlBufferArgEncoder release];
|
||||||
[mtlASArgEncoder release];
|
[mtlASArgEncoder release];
|
||||||
[mtlAncillaryArgEncoder release];
|
[mtlAncillaryArgEncoder release];
|
||||||
[mtlGeneralCommandQueue release];
|
[mtlGeneralCommandQueue release];
|
||||||
@@ -271,6 +305,11 @@ bool MetalDevice::use_adaptive_compilation()
|
|||||||
return DebugFlags().metal.adaptive_compile;
|
return DebugFlags().metal.adaptive_compile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool MetalDevice::use_local_atomic_sort() const
|
||||||
|
{
|
||||||
|
return DebugFlags().metal.use_local_atomic_sort;
|
||||||
|
}
|
||||||
|
|
||||||
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
|
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
|
||||||
{
|
{
|
||||||
string global_defines;
|
string global_defines;
|
||||||
@@ -278,6 +317,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
|
|||||||
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
|
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (use_local_atomic_sort()) {
|
||||||
|
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
|
||||||
|
}
|
||||||
|
|
||||||
if (use_metalrt) {
|
if (use_metalrt) {
|
||||||
global_defines += "#define __METALRT__\n";
|
global_defines += "#define __METALRT__\n";
|
||||||
if (motion_blur) {
|
if (motion_blur) {
|
||||||
@@ -300,6 +343,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
|
|||||||
break;
|
break;
|
||||||
case METAL_GPU_APPLE:
|
case METAL_GPU_APPLE:
|
||||||
global_defines += "#define __KERNEL_METAL_APPLE__\n";
|
global_defines += "#define __KERNEL_METAL_APPLE__\n";
|
||||||
|
# ifdef WITH_NANOVDB
|
||||||
|
global_defines += "#define WITH_NANOVDB\n";
|
||||||
|
# endif
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -514,6 +560,11 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool MetalDevice::is_texture(const TextureInfo &tex)
|
||||||
|
{
|
||||||
|
return (tex.depth > 0 || tex.height > 0);
|
||||||
|
}
|
||||||
|
|
||||||
void MetalDevice::load_texture_info()
|
void MetalDevice::load_texture_info()
|
||||||
{
|
{
|
||||||
if (need_texture_info) {
|
if (need_texture_info) {
|
||||||
@@ -525,21 +576,20 @@ void MetalDevice::load_texture_info()
|
|||||||
|
|
||||||
for (int tex = 0; tex < num_textures; tex++) {
|
for (int tex = 0; tex < num_textures; tex++) {
|
||||||
uint64_t offset = tex * sizeof(void *);
|
uint64_t offset = tex * sizeof(void *);
|
||||||
|
if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
|
||||||
id<MTLTexture> metal_texture = texture_slot_map[tex];
|
id<MTLTexture> metal_texture = texture_slot_map[tex];
|
||||||
if (!metal_texture) {
|
|
||||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
|
||||||
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
|
||||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
|
||||||
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
MTLTextureType type = metal_texture.textureType;
|
MTLTextureType type = metal_texture.textureType;
|
||||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
||||||
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
|
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
|
||||||
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
||||||
[mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
|
[mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
|
||||||
|
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
||||||
|
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
|
||||||
|
[mtlTextureArgEncoder setTexture:nil atIndex:0];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (default_storage_mode == MTLResourceStorageModeManaged) {
|
if (default_storage_mode == MTLResourceStorageModeManaged) {
|
||||||
[texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
|
[texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
|
||||||
@@ -558,7 +608,7 @@ void MetalDevice::erase_allocation(device_memory &mem)
|
|||||||
if (it != metal_mem_map.end()) {
|
if (it != metal_mem_map.end()) {
|
||||||
MetalMem *mmem = it->second.get();
|
MetalMem *mmem = it->second.get();
|
||||||
|
|
||||||
/* blank out reference to MetalMem* in the launch params (fixes crash T94736) */
|
/* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
|
||||||
if (mmem->pointer_index >= 0) {
|
if (mmem->pointer_index >= 0) {
|
||||||
device_ptr *pointers = (device_ptr *)&launch_params;
|
device_ptr *pointers = (device_ptr *)&launch_params;
|
||||||
pointers[mmem->pointer_index] = 0;
|
pointers[mmem->pointer_index] = 0;
|
||||||
@@ -712,7 +762,6 @@ void MetalDevice::generic_free(device_memory &mem)
|
|||||||
mem.shared_pointer = 0;
|
mem.shared_pointer = 0;
|
||||||
|
|
||||||
/* Free device memory. */
|
/* Free device memory. */
|
||||||
delayed_free_list.push_back(mmem.mtlBuffer);
|
|
||||||
mmem.mtlBuffer = nil;
|
mmem.mtlBuffer = nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -947,7 +996,7 @@ void MetalDevice::global_free(device_memory &mem)

 void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
 {
-  generic_alloc(mem);
+  MetalDevice::MetalMem *mmem = generic_alloc(mem);
   generic_copy_to(mem);

   /* Resize once */
@@ -956,27 +1005,32 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
     /* Allocate some slots in advance, to reduce amount
      * of re-allocations. */
     texture_info.resize(round_up(slot + 1, 128));
+    texture_slot_map.resize(round_up(slot + 1, 128));
   }

-  mem.info.data = (uint64_t)mem.device_pointer;
-
-  /* Set Mapping and tag that we need to (re-)upload to device */
   texture_info[slot] = mem.info;
+  uint64_t offset = slot * sizeof(void *);
+  [mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
+  [mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
+  texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
+  texture_slot_map[slot] = nil;
   need_texture_info = true;
 }

 void MetalDevice::tex_alloc(device_texture &mem)
 {
   /* Check that dimensions fit within maximum allowable size.
+   * If 1D texture is allocated, use 1D buffer.
    * See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
-  if (mem.data_width > 16384 || mem.data_height > 16384) {
-    set_error(string_printf(
-        "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
-        mem.data_width,
-        mem.data_height));
-    return;
+  if (mem.data_height > 0) {
+    if (mem.data_width > 16384 || mem.data_height > 16384) {
+      set_error(string_printf(
+          "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
+          mem.data_width,
+          mem.data_height));
+      return;
+    }
   }

   MTLStorageMode storage_mode = MTLStorageModeManaged;
   if (@available(macos 10.15, *)) {
     if ([mtlDevice hasUnifiedMemory] &&
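The reworked tex_alloc_as_buffer path stores 1D texture data in plain Metal buffers bound through buffer_bindings_1d, so the 16384-pixel texture limit now only applies when the allocation really is 2D or 3D. A sketch of the implied dimensionality dispatch (assumed semantics: data_height == 0 and data_depth == 0 means 1D; illustration only, not Cycles code):

enum class TexStorage { Buffer1D, Texture2D, Texture3D };

struct Dims {
  size_t data_width, data_height, data_depth;
};

static TexStorage pick_storage(const Dims &mem)
{
  if (mem.data_depth > 1) {
    return TexStorage::Texture3D;
  }
  if (mem.data_height > 0) {
    return TexStorage::Texture2D; /* subject to the 16384 x 16384 limit */
  }
  return TexStorage::Buffer1D; /* the tex_alloc_as_buffer() path */
}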
@@ -1116,8 +1170,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
|
|||||||
bytesPerRow:src_pitch];
|
bytesPerRow:src_pitch];
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
assert(0);
|
|
||||||
/* 1D texture, using linear memory. */
|
/* 1D texture, using linear memory. */
|
||||||
|
tex_alloc_as_buffer(mem);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
mem.device_pointer = (device_ptr)mtlTexture;
|
mem.device_pointer = (device_ptr)mtlTexture;
|
||||||
@@ -1141,17 +1196,22 @@ void MetalDevice::tex_alloc(device_texture &mem)
|
|||||||
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
|
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
|
||||||
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
|
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
|
||||||
if (texture_bindings_2d) {
|
if (texture_bindings_2d) {
|
||||||
|
delayed_free_list.push_back(buffer_bindings_1d);
|
||||||
delayed_free_list.push_back(texture_bindings_2d);
|
delayed_free_list.push_back(texture_bindings_2d);
|
||||||
delayed_free_list.push_back(texture_bindings_3d);
|
delayed_free_list.push_back(texture_bindings_3d);
|
||||||
|
|
||||||
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||||
|
texture_bindings_3d.allocatedSize);
|
||||||
}
|
}
|
||||||
|
buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
|
||||||
|
options:default_storage_mode];
|
||||||
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
|
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
|
||||||
options:default_storage_mode];
|
options:default_storage_mode];
|
||||||
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
|
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
|
||||||
options:default_storage_mode];
|
options:default_storage_mode];
|
||||||
|
|
||||||
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
|
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
|
||||||
|
texture_bindings_3d.allocatedSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1178,12 +1238,18 @@ void MetalDevice::tex_alloc(device_texture &mem)
|
|||||||
|
|
||||||
void MetalDevice::tex_free(device_texture &mem)
|
void MetalDevice::tex_free(device_texture &mem)
|
||||||
{
|
{
|
||||||
|
if (mem.data_depth == 0 && mem.data_height == 0) {
|
||||||
|
generic_free(mem);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (metal_mem_map.count(&mem)) {
|
if (metal_mem_map.count(&mem)) {
|
||||||
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
|
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
|
||||||
MetalMem &mmem = *metal_mem_map.at(&mem);
|
MetalMem &mmem = *metal_mem_map.at(&mem);
|
||||||
|
|
||||||
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
|
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
|
||||||
texture_slot_map[mem.slot] = nil;
|
if (texture_slot_map[mem.slot] == mmem.mtlTexture)
|
||||||
|
texture_slot_map[mem.slot] = nil;
|
||||||
|
|
||||||
if (mmem.mtlTexture) {
|
if (mmem.mtlTexture) {
|
||||||
/* Free bindless texture. */
|
/* Free bindless texture. */
|
||||||
@@ -1231,6 +1297,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
|||||||
if (@available(macos 11.0, *)) {
|
if (@available(macos 11.0, *)) {
|
||||||
if (bvh->params.top_level) {
|
if (bvh->params.top_level) {
|
||||||
bvhMetalRT = bvh_metal;
|
bvhMetalRT = bvh_metal;
|
||||||
|
|
||||||
|
// allocate required buffers for BLAS array
|
||||||
|
uint64_t count = bvhMetalRT->blas_array.size();
|
||||||
|
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
|
||||||
|
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
|
||||||
|
stats.mem_alloc(blas_buffer.allocatedSize);
|
||||||
|
|
||||||
|
for (uint64_t i = 0; i < count; ++i) {
|
||||||
|
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
|
||||||
|
offset:i * mtlBlasArgEncoder.encodedLength];
|
||||||
|
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
|
||||||
|
}
|
||||||
|
|
||||||
|
count = bvhMetalRT->blas_lookup.size();
|
||||||
|
bufferSize = sizeof(uint32_t) * count;
|
||||||
|
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
|
||||||
|
options:default_storage_mode];
|
||||||
|
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
|
||||||
|
|
||||||
|
memcpy([blas_lookup_buffer contents],
|
||||||
|
bvhMetalRT -> blas_lookup.data(),
|
||||||
|
blas_lookup_buffer.allocatedSize);
|
||||||
|
|
||||||
|
if (default_storage_mode == MTLResourceStorageModeManaged) {
|
||||||
|
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
|
||||||
|
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -19,6 +19,8 @@ enum {
   METALRT_FUNC_SHADOW_BOX,
   METALRT_FUNC_LOCAL_TRI,
   METALRT_FUNC_LOCAL_BOX,
+  METALRT_FUNC_LOCAL_TRI_PRIM,
+  METALRT_FUNC_LOCAL_BOX_PRIM,
   METALRT_FUNC_CURVE_RIBBON,
   METALRT_FUNC_CURVE_RIBBON_SHADOW,
   METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
   METALRT_FUNC_NUM
 };

-enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+enum {
+  METALRT_TABLE_DEFAULT,
+  METALRT_TABLE_SHADOW,
+  METALRT_TABLE_LOCAL,
+  METALRT_TABLE_LOCAL_PRIM,
+  METALRT_TABLE_NUM
+};

 /* Pipeline State Object types */
 enum MetalPipelineType {
@@ -87,6 +87,9 @@ struct ShaderCache {
         break;
       }
     }
+
+    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
+    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
   }
   ~ShaderCache();

@@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
      "__anyhit__cycles_metalrt_shadow_all_hit_box",
      "__anyhit__cycles_metalrt_local_hit_tri",
      "__anyhit__cycles_metalrt_local_hit_box",
+     "__anyhit__cycles_metalrt_local_hit_tri_prim",
+     "__anyhit__cycles_metalrt_local_hit_box_prim",
      "__intersection__curve_ribbon",
      "__intersection__curve_ribbon_shadow",
      "__intersection__curve_all",
@@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
                          rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                          rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                          nil];
+    table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
+        arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+                         nil];

     NSMutableSet *unique_functions = [NSMutableSet
         setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
     [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
     [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];

     if (kernel_has_intersection(device_kernel)) {
       linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
@@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
   virtual int num_concurrent_states(const size_t) const override;
   virtual int num_concurrent_busy_states(const size_t) const override;
   virtual int num_sort_partition_elements() const override;
+  virtual bool supports_local_atomic_sort() const override;

   virtual void init_execution() override;

@@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
   return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
 }

+bool MetalDeviceQueue::supports_local_atomic_sort() const
+{
+  return metal_device_->use_local_atomic_sort();
+}
+
 void MetalDeviceQueue::init_execution()
 {
   /* Synchronize all textures and memory copies before executing task. */
|
|||||||
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
|
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
|
||||||
offset:0
|
offset:0
|
||||||
atIndex:1];
|
atIndex:1];
|
||||||
|
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
|
||||||
|
offset:0
|
||||||
|
atIndex:2];
|
||||||
|
|
||||||
if (@available(macos 12.0, *)) {
|
if (@available(macos 12.0, *)) {
|
||||||
if (metal_device_->use_metalrt) {
|
if (metal_device_->use_metalrt) {
|
||||||
if (metal_device_->bvhMetalRT) {
|
if (metal_device_->bvhMetalRT) {
|
||||||
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
|
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
|
||||||
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
|
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
|
||||||
|
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
|
||||||
|
offset:0
|
||||||
|
atIndex:8];
|
||||||
|
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
|
||||||
|
offset:0
|
||||||
|
atIndex:9];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
|
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
|
||||||
@@ -486,13 +501,13 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
atIndex:1];
|
atIndex:1];
|
||||||
[metal_device_->mtlAncillaryArgEncoder
|
[metal_device_->mtlAncillaryArgEncoder
|
||||||
setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
|
setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
|
||||||
atIndex:3 + table];
|
atIndex:4 + table];
|
||||||
[mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
|
[mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
|
||||||
usage:MTLResourceUsageRead];
|
usage:MTLResourceUsageRead];
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
[metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
|
[metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
|
||||||
atIndex:3 + table];
|
atIndex:4 + table];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -527,6 +542,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
if (bvhMetalRT) {
|
if (bvhMetalRT) {
|
||||||
/* Mark all Accelerations resources as used */
|
/* Mark all Accelerations resources as used */
|
||||||
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
|
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
|
||||||
|
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
|
||||||
|
usage:MTLResourceUsageRead];
|
||||||
|
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
|
||||||
|
usage:MTLResourceUsageRead];
|
||||||
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
|
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
|
||||||
count:bvhMetalRT->blas_array.size()
|
count:bvhMetalRT->blas_array.size()
|
||||||
usage:MTLResourceUsageRead];
|
usage:MTLResourceUsageRead];
|
||||||
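Read off the setBuffer/setAccelerationStructure calls in this hunk, the ancillary argument slots shift as follows (an inferred summary, not an authoritative layout):

/* atIndex 2: buffer_bindings_1d   (new)
 * atIndex 3: TLAS accel_struct    (previously 2)
 * atIndex 4 + table: intersection function tables, now including
 *                    METALRT_TABLE_LOCAL_PRIM (previously 3 + table)
 * atIndex 8: blas_buffer          (new)
 * atIndex 9: blas_lookup_buffer   (new)
 */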
@@ -553,13 +572,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
|
|||||||
/* See parallel_active_index.h for why this amount of shared memory is needed.
|
/* See parallel_active_index.h for why this amount of shared memory is needed.
|
||||||
* Rounded up to 16 bytes for Metal */
|
* Rounded up to 16 bytes for Metal */
|
||||||
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
|
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
|
||||||
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
|
||||||
|
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
|
||||||
|
int key_count = metal_device_->launch_params.data.max_shaders;
|
||||||
|
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (shared_mem_bytes) {
|
||||||
|
assert(shared_mem_bytes <= 32 * 1024);
|
||||||
|
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
|
||||||
|
}
|
||||||
|
|
||||||
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
|
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
|
||||||
divide_up(work_size, num_threads_per_block), 1, 1);
|
divide_up(work_size, num_threads_per_block), 1, 1);
|
||||||
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
|
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
|
||||||
@@ -848,6 +878,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
|
|||||||
/* ancillaries */
|
/* ancillaries */
|
||||||
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
|
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
|
||||||
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
|
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
|
||||||
|
[mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
|
||||||
}
|
}
|
||||||
|
|
||||||
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
|
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
|
||||||
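The new sort cases size threadgroup memory as one int-sized counter per shader key, rounded up to 16 bytes, and the hoisted setThreadgroupMemoryLength call now runs once for any kernel that requested shared memory, with an assert guarding a 32 KiB budget. A tiny sketch of the same arithmetic (illustrative only):

#include <cassert>
#include <cstddef>

/* One int per shader key, rounded up to Metal's 16-byte granularity. */
static size_t shared_mem_for_sort(const size_t max_shaders)
{
  const size_t bytes = max_shaders * sizeof(int);
  return (bytes + 15) & ~size_t(15);
}

int main()
{
  assert(shared_mem_for_sort(3) == 16);
  assert(shared_mem_for_sort(100) == 400);
  return 0;
}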
@@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
     return METAL_GPU_INTEL;
   }
   else if (strstr(device_name, "AMD")) {
+    /* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
+    if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
+      if (atoi(str)) {
+        return METAL_GPU_UNKNOWN;
+      }
+    }
     return METAL_GPU_AMD;
   }
   else if (strstr(device_name, "Apple")) {
@@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
     return usable_devices;
   }

+  /* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
+   * one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
+  bool has_usable_amd_gpu = false;
+  if (@available(macos 12.3, *)) {
+    for (id<MTLDevice> device in MTLCopyAllDevices()) {
+      has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
+    }
+  }
+
   metal_printf("Usable Metal devices:\n");
   for (id<MTLDevice> device in MTLCopyAllDevices()) {
     string device_name = get_device_name(device);
@@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
     }

 # if defined(MAC_OS_VERSION_13_0)
-    if (@available(macos 13.0, *)) {
-      usable |= (vendor == METAL_GPU_INTEL);
+    if (!has_usable_amd_gpu) {
+      if (@available(macos 13.0, *)) {
+        usable |= (vendor == METAL_GPU_INTEL);
+      }
     }
 # endif
@@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
         context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
   }

+  OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
   vector<OptixStackSizes> osl_stack_size(osl_groups.size());

   /* Update SBT with new entries. */
   sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
   for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
     optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+    optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
   }
   for (size_t i = 0; i < osl_groups.size(); ++i) {
     if (osl_groups[i] != NULL) {
@@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
         0,
         &pipelines[PIP_SHADE]));

+    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
     unsigned int dss = 0;
     for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
       dss = std::max(dss, osl_stack_size[i].dssDC);
     }

     optix_assert(optixPipelineSetStackSize(
-        pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
+        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
   }

   return !have_error();
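The two additions above give the OSL shading pipeline a correct continuation stack: optixProgramGroupGetStackSize() records each built-in group's requirements, and the continuation stack size handed to optixPipelineSetStackSize() (per the OptiX 7 API, the fourth argument, previously hard-coded to 0) becomes the larger cssRG of the two shade-surface raygen groups, while dss still covers the largest direct-callable stack among the OSL groups.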
@@ -112,6 +112,13 @@ class DeviceQueue {
     return 65536;
   }

+  /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
+   * INTEGRATOR_SORT_WRITE_PASS)? */
+  virtual bool supports_local_atomic_sort() const
+  {
+    return false;
+  }
+
   /* Initialize execution of kernels on this queue.
    *
    * Will, for example, load all data required by the kernels from Device to global or path state.
@@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
|
|||||||
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
|
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
|
||||||
integrator_shader_sort_prefix_sum_(
|
integrator_shader_sort_prefix_sum_(
|
||||||
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
|
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
|
||||||
|
integrator_shader_sort_partition_key_offsets_(
|
||||||
|
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
|
||||||
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
|
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
|
||||||
integrator_next_shadow_path_index_(
|
integrator_next_shadow_path_index_(
|
||||||
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
|
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
|
||||||
@@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
   integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
                                                                 num_sort_partitions_);
 
-  /* Allocate arrays for shader sorting. */
-  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
-  if (integrator_shader_sort_counter_.size() < sort_buckets) {
-    integrator_shader_sort_counter_.alloc(sort_buckets);
-    integrator_shader_sort_counter_.zero_to_device();
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
-        (int *)integrator_shader_sort_counter_.device_pointer;
-
-    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
-    integrator_shader_sort_prefix_sum_.zero_to_device();
-  }
-
-  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
-    if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
-      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
-      integrator_shader_raytrace_sort_counter_.zero_to_device();
-      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
-    }
-  }
-
-  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
-    if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
-      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
-      integrator_shader_mnee_sort_counter_.zero_to_device();
-      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
-    }
-  }
+  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
+    /* Allocate array for partitioned shader sorting using local atomics. */
+    const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
+    if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
+      integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
+      integrator_shader_sort_partition_key_offsets_.zero_to_device();
+    }
+    integrator_state_gpu_.sort_partition_key_offsets =
+        (int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
+  }
+  else {
+    /* Allocate arrays for shader sorting. */
+    const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+    if (integrator_shader_sort_counter_.size() < sort_buckets) {
+      integrator_shader_sort_counter_.alloc(sort_buckets);
+      integrator_shader_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+          (int *)integrator_shader_sort_counter_.device_pointer;
+
+      integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
+      integrator_shader_sort_prefix_sum_.zero_to_device();
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
+        integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+        integrator_shader_raytrace_sort_counter_.zero_to_device();
+        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+            (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+      }
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+      if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
+        integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+        integrator_shader_mnee_sort_counter_.zero_to_device();
+        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+            (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+      }
+    }
+  }
 }
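Note on the new layout: each of the num_sort_partitions_ partitions owns a run of (max_shaders + 1) ints in the key-offsets buffer, one offset per shader key plus a trailing slot for the partition's total. A minimal host-side sketch of that layout, with illustrative names and values (not part of the diff):

#include <vector>

int main()
{
  const int max_shaders = 4, num_partitions = 2;
  std::vector<int> offsets((max_shaders + 1) * num_partitions, 0);

  const int bucket_sizes[4] = {3, 0, 5, 1}; /* made-up counts for partition 0 */
  int running = 0;
  for (int key = 0; key < max_shaders; key++) {
    offsets[0 * (max_shaders + 1) + key] = running; /* 0, 3, 3, 8 */
    running += bucket_sizes[key];
  }
  offsets[0 * (max_shaders + 1) + max_shaders] = running; /* partition total: 9 */
  return 0;
}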
@@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
     work_size = num_queued;
     d_path_index = queued_paths_.device_pointer;
 
-    compute_sorted_queued_paths(
-        DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
+    compute_sorted_queued_paths(kernel, num_paths_limit);
   }
   else if (num_queued < work_size) {
     work_size = num_queued;
@@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
   }
 }
 
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
-                                                   DeviceKernel queued_kernel,
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
                                                    const int num_paths_limit)
 {
   int d_queued_kernel = queued_kernel;
 
+  /* Launch kernel to fill the active paths arrays. */
+  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
+    const int work_size = kernel_max_active_main_path_index(queued_kernel);
+    device_ptr d_queued_paths = queued_paths_.device_pointer;
+
+    int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
+
+    DeviceKernelArguments args(
+        &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
+
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
+    return;
+  }
+
   device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
   device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
   assert(d_counter != 0 && d_prefix_sum != 0);
@@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
                              &d_prefix_sum,
                              &d_queued_kernel);
 
-    queue_->enqueue(kernel, work_size, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
   }
 }
 
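The grid sizing in the new dispatch is deliberate: assuming GPU_PARALLEL_SORT_BLOCK_SIZE of 1024 (defined later in this diff), enqueueing 1024 * num_sort_partitions_ threads yields exactly one threadgroup per sort partition, so the threadgroup index can double as the partition index. A trivial check of that arithmetic:

int main()
{
  const int block_size = 1024;        /* GPU_PARALLEL_SORT_BLOCK_SIZE */
  const int num_sort_partitions = 8;  /* illustrative value */
  const int total_threads = block_size * num_sort_partitions;
  const int num_threadgroups = total_threads / block_size; /* == num_sort_partitions */
  return num_threadgroups == num_sort_partitions ? 0 : 1;
}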
@@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
 
   void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
-  void compute_sorted_queued_paths(DeviceKernel kernel,
-                                   DeviceKernel queued_kernel,
-                                   const int num_paths_limit);
+  void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
 
   void compact_main_paths(const int num_active_paths);
   void compact_shadow_paths();
@@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   device_vector<int> integrator_shader_raytrace_sort_counter_;
   device_vector<int> integrator_shader_mnee_sort_counter_;
   device_vector<int> integrator_shader_sort_prefix_sum_;
+  device_vector<int> integrator_shader_sort_partition_key_offsets_;
   /* Path split. */
   device_vector<int> integrator_next_main_path_index_;
   device_vector<int> integrator_next_shadow_path_index_;
@@ -886,7 +886,7 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
 {
   /* Special trick for fast navigation: schedule multiple samples during fast navigation
    * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
-   * usable visual feedback for artists. There are a couple of tricks though. */
+   * usable visual feedback for artists. */
 
   if (is_denoise_active_during_update()) {
     /* When denoising is used during navigation prefer using a higher resolution with less samples
@@ -896,25 +896,12 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
     return 1;
   }
 
-  if (resolution_divider <= pixel_size_) {
-    /* When resolution divider is at or below pixel size, schedule one sample. This doesn't effect
-     * the sample count at this resolution division, but instead assists in the calculation of
-     * the resolution divider. */
-    return 1;
-  }
-
-  if (resolution_divider == pixel_size_ * 2) {
-    /* When resolution divider is the previous step to the final resolution, schedule two samples.
-     * This is so that rendering on lower resolution does not exceed time that it takes to render
-     * first sample at the full resolution. */
-    return 2;
-  }
-
-  /* Always render 4 samples, even if scene is configured for less.
-   * The idea here is to have enough information on the screen. Resolution divider of 2 allows us
-   * to have 4 time extra samples, so overall worst case timing is the same as the final resolution
-   * at one sample. */
-  return 4;
+  /* Schedule samples equal to the resolution divider up to a maximum of 4.
+   * The idea is to have enough information on the screen by increasing the sample count as the
+   * resolution is decreased. */
+  /* NOTE: Changing this formula will change the formula in
+   * `RenderScheduler::calculate_resolution_divider_for_time()`. */
+  return min(max(1, resolution_divider / pixel_size_), 4);
 }
 
 bool RenderScheduler::work_need_adaptive_filter() const
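A quick table of the new formula, assuming pixel_size_ = 1: dividers 1, 2, 4 and 8 schedule 1, 2, 4 and 4 samples respectively, with larger dividers staying clamped at 4. A self-contained restatement (std::min/std::max stand in for Cycles' own min/max):

#include <algorithm>

int num_samples_during_navigation(int resolution_divider, int pixel_size)
{
  /* Sample count grows with the divider and clamps at 4. */
  return std::min(std::max(1, resolution_divider / pixel_size), 4);
}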
@@ -1100,9 +1087,10 @@ void RenderScheduler::update_start_resolution_divider()
   /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
    * render time is somewhere on a boundary between two resolutions. */
 
-  /* Never increase resolution to higher than the pixel size (which is possible if the scene is
-   * simple and compute device is fast). */
-  start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+  /* Don't let resolution drop below the desired one. It's better to be slow than provide an
+   * unreadable viewport render. */
+  start_resolution_divider_ = min(resolution_divider_for_update,
+                                  default_start_resolution_divider_);
 
   VLOG_WORK << "Calculated resolution divider is " << start_resolution_divider_;
 }
@@ -1187,24 +1175,24 @@ void RenderScheduler::check_time_limit_reached()
 
 int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
 {
-  /* TODO(sergey): There should a non-iterative analytical formula here. */
-
-  int resolution_divider = 1;
-
-  /* This algorithm iterates through resolution dividers until a divider is found that achieves
-   * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
-   * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
-   * pre_resolution_division_samples and post_resolution_division_samples are used in this
-   * calculation to better predict the performance impact of changing resolution divisions as
-   * the sample count can also change between resolution divisions. */
-  while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
-    int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
-    resolution_divider = resolution_divider * 2;
-    int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
-    actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
-  }
-
-  return resolution_divider;
+  const double ratio_between_times = actual_time / desired_time;
+
+  /* We can pass `ratio_between_times` to `get_num_samples_during_navigation()` to get our
+   * navigation samples because the equation for calculating the resolution divider is as follows:
+   * `actual_time / desired_time = sqr(resolution_divider) / sample_count`.
+   * While `resolution_divider` is less than or equal to 4, `resolution_divider = sample_count`
+   * (This relationship is determined in `get_num_samples_during_navigation()`). With some
+   * substitution we end up with `actual_time / desired_time = resolution_divider` while the
+   * resolution divider is less than or equal to 4. Once the resolution divider increases above 4,
+   * the relationship of `actual_time / desired_time = resolution_divider` is no longer true,
+   * however the sample count retrieved from `get_num_samples_during_navigation()` is still
+   * accurate if we continue using this assumption. It should be noted that the interaction between
+   * `pixel_size`, sample count, and resolution divider are automatically accounted for and that's
+   * why `pixel_size` isn't included in any of the equations. */
+  const int navigation_samples = get_num_samples_during_navigation(
+      ceil_to_int(ratio_between_times));
+
+  return ceil_to_int(sqrt(navigation_samples * ratio_between_times));
 }
 
 int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
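A numeric check of the closed form, using illustrative values with pixel_size = 1: if the last render took 12x the desired time, the ratio is 12, the navigation sample count clamps to 4, and the divider comes out as ceil(sqrt(4 * 12)) = 7. Indeed 7^2 / 4 = 12.25, close to the target ratio of 12:

#include <cmath>

int main()
{
  const double ratio = 12.0;        /* actual_time / desired_time */
  const int navigation_samples = 4; /* min(max(1, 12), 4) for pixel_size 1 */
  const int divider = (int)std::ceil(std::sqrt(navigation_samples * ratio));
  return divider; /* 7 */
}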
@@ -412,11 +412,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
   # warn for other versions
   if((CUDA_VERSION STREQUAL "101") OR
      (CUDA_VERSION STREQUAL "102") OR
-     (CUDA_VERSION_MAJOR STREQUAL "11"))
+     (CUDA_VERSION_MAJOR STREQUAL "11") OR
+     (CUDA_VERSION_MAJOR STREQUAL "12"))
   else()
     message(WARNING
       "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-      "build may succeed but only CUDA 11, 10.2 and 10.1 have been tested")
+      "build may succeed but only CUDA 12, 11, 10.2 and 10.1 have been tested")
   endif()
 
   # build for each arch
@@ -514,6 +515,16 @@ if(WITH_CYCLES_CUDA_BINARIES)
       else()
         message(STATUS "CUDA binaries for ${arch} require CUDA 10 or earlier, skipped.")
       endif()
+    elseif(${arch} MATCHES ".*_3.")
+      if(DEFINED CUDA11_NVCC_EXECUTABLE)
+        set(cuda_nvcc_executable ${CUDA11_NVCC_EXECUTABLE})
+        set(cuda_toolkit_root_dir ${CUDA11_TOOLKIT_ROOT_DIR})
+      elseif("${CUDA_VERSION}" LESS 120) # Support for sm_35, sm_37 was removed in CUDA 12
+        set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
+        set(cuda_toolkit_root_dir ${CUDA_TOOLKIT_ROOT_DIR})
+      else()
+        message(STATUS "CUDA binaries for ${arch} require CUDA 11 or earlier, skipped.")
+      endif()
     elseif(${arch} MATCHES ".*_7." AND "${CUDA_VERSION}" LESS 100)
       message(STATUS "CUDA binaries for ${arch} require CUDA 10.0+, skipped.")
     elseif(${arch} MATCHES ".*_8.")
@@ -661,7 +661,8 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
 #endif
 }
 
-ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_private const ShaderClosure *sc)
+ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
+                                       ccl_private const ShaderClosure *sc)
 {
   Spectrum albedo = sc->weight;
   /* Some closures include additional components such as Fresnel terms that cause their albedo to
@@ -685,7 +686,7 @@ ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_pri
       albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
       break;
     case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
-      albedo *= bsdf_principled_hair_albedo(sc);
+      albedo *= bsdf_principled_hair_albedo(sd, sc);
      break;
    default:
      break;
@@ -478,10 +478,18 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
   return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
 }
 
-ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderClosure *sc)
+ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderData *sd,
+                                                ccl_private const ShaderClosure *sc)
 {
   ccl_private PrincipledHairBSDF *bsdf = (ccl_private PrincipledHairBSDF *)sc;
-  return exp(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
+
+  const float cos_theta_o = cos_from_sin(dot(sd->wi, safe_normalize(sd->dPdu)));
+  const float cos_gamma_o = cos_from_sin(bsdf->extra->geom.w);
+  const float f = fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta);
+
+  const float roughness_scale = bsdf_principled_hair_albedo_roughness_scale(bsdf->v);
+  /* TODO(lukas): Adding the Fresnel term here as a workaround until the proper refactor. */
+  return exp(-sqrt(bsdf->sigma) * roughness_scale) + make_spectrum(f);
 }
 
 ccl_device_inline Spectrum
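The new albedo code recovers cosines from stored sines. Assuming Cycles' cos_from_sin is the usual identity sqrt(1 - s^2) clamped at zero, a standalone sketch of that helper:

#include <algorithm>
#include <cmath>

float cos_from_sin(float s)
{
  /* cos(theta) from sin(theta), clamped so rounding never yields NaN. */
  return std::sqrt(std::max(1.0f - s * s, 0.0f));
}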
@@ -519,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-/* Required to maintain OSL interface. */
-ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
-{
-  bsdf->alpha_y = bsdf->alpha_x;
-
-  return bsdf_microfacet_ggx_setup(bsdf);
-}
-
 ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
                                                  ccl_private const ShaderData *sd)
 {
@@ -613,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
   return SD_BSDF | SD_BSDF_HAS_EVAL;
 }
 
-/* Required to maintain OSL interface. */
-ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
-{
-  bsdf->alpha_y = bsdf->alpha_x;
-
-  return bsdf_microfacet_beckmann_setup(bsdf);
-}
-
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
 {
   bsdf->alpha_x = saturatef(bsdf->alpha_x);
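The two removed *_isotropic_setup() wrappers only mirrored alpha_x into alpha_y before delegating to the anisotropic setup, so any remaining caller can inline that. A minimal sketch with a stub setup function (names mirror the diff, the body is illustrative):

struct MicrofacetBsdf {
  float alpha_x, alpha_y;
};

static int bsdf_microfacet_ggx_setup(MicrofacetBsdf *)
{
  return 0; /* stand-in for SD_BSDF | SD_BSDF_HAS_EVAL */
}

static int isotropic_setup(MicrofacetBsdf *bsdf)
{
  bsdf->alpha_y = bsdf->alpha_x; /* isotropic: equal roughness on both axes */
  return bsdf_microfacet_ggx_setup(bsdf);
}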
@@ -90,8 +90,10 @@ ccl_device float schlick_fresnel(float u)
 }
 
 /* Calculate the fresnel color, which is a blend between white and the F0 color */
-ccl_device_forceinline Spectrum
-interpolate_fresnel_color(float3 L, float3 H, float ior, Spectrum F0)
+ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
+                                                          float3 H,
+                                                          float ior,
+                                                          Spectrum F0)
 {
   /* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
    * The reason why we use this remapping instead of directly doing the
@@ -10,7 +10,7 @@
 #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #  define KERNEL_STUB
 #else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316. */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
@@ -10,7 +10,7 @@
 #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #  define KERNEL_STUB
 #else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316. */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #    define __KERNEL_SSE2__
 #  endif
@@ -10,7 +10,7 @@
 #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #  define KERNEL_STUB
 #else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316. */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
@@ -5,13 +5,14 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef WITH_NANOVDB
-#  define NDEBUG /* Disable "assert" in device code */
-#  define NANOVDB_USE_INTRINSICS
-#  include "nanovdb/NanoVDB.h"
-#  include "nanovdb/util/SampleFromVoxels.h"
+#if !defined __KERNEL_METAL__
+#  ifdef WITH_NANOVDB
+#    define NDEBUG /* Disable "assert" in device code */
+#    define NANOVDB_USE_INTRINSICS
+#    include "nanovdb/NanoVDB.h"
+#    include "nanovdb/util/SampleFromVoxels.h"
+#  endif
 #endif
 
 /* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
 ccl_device float cubic_w0(float a)
 {
@@ -126,7 +127,7 @@ kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, fl
 #ifdef WITH_NANOVDB
 template<typename T, typename S>
 ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb(
-    S &s, float x, float y, float z)
+    ccl_private S &s, float x, float y, float z)
 {
   float px = floorf(x);
   float py = floorf(y);
@@ -157,13 +158,19 @@ ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tric
                 g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
 }
 
+#  if defined(__KERNEL_METAL__)
+template<typename T>
+__attribute__((noinline)) typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
+    ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
+#  else
 template<typename T>
 ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
     ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
+#  endif
 {
   using namespace nanovdb;
 
-  NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+  ccl_global NanoGrid<T> *const grid = (ccl_global NanoGrid<T> *)info.data;
   typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
   AccessorType acc = grid->getAccessor();
 
@@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 }
 ccl_gpu_kernel_postfix
 
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
+                             int num_states,
+                             int partition_size,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             int kernel_index)
+{
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  int max_shaders = context.launch_params_metal.data.max_shaders;
+  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
+                                           kernel_integrator_state.path.queued_kernel;
+  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
+                                           kernel_integrator_state.path.shader_sort_key;
+  ccl_global int *key_offsets = (ccl_global int *)
+                                    kernel_integrator_state.sort_partition_key_offsets;
+
+  gpu_parallel_sort_bucket_pass(num_states,
+                                partition_size,
+                                max_shaders,
+                                kernel_index,
+                                d_queued_kernel,
+                                d_shader_sort_key,
+                                key_offsets,
+                                (threadgroup int *)threadgroup_array,
+                                metal_local_id,
+                                metal_local_size,
+                                metal_grid_id);
+#endif
+}
+ccl_gpu_kernel_postfix
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sort_write_pass,
+                             int num_states,
+                             int partition_size,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             int kernel_index)
+{
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  int max_shaders = context.launch_params_metal.data.max_shaders;
+  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
+                                           kernel_integrator_state.path.queued_kernel;
+  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
+                                           kernel_integrator_state.path.shader_sort_key;
+  ccl_global int *key_offsets = (ccl_global int *)
+                                    kernel_integrator_state.sort_partition_key_offsets;
+
+  gpu_parallel_sort_write_pass(num_states,
+                               partition_size,
+                               max_shaders,
+                               kernel_index,
+                               num_states_limit,
+                               indices,
+                               d_queued_kernel,
+                               d_shader_sort_key,
+                               key_offsets,
+                               (threadgroup int *)threadgroup_array,
+                               metal_local_id,
+                               metal_local_size,
+                               metal_grid_id);
+#endif
+}
+ccl_gpu_kernel_postfix
+
 ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
     ccl_gpu_kernel_signature(integrator_compact_paths_array,
                              int num_states,
@@ -579,7 +645,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
                                                           const int y,
                                                           const half4 half_pixel)
 {
-  /* Work around HIP issue with half float display, see T92972. */
+  /* Work around HIP issue with half float display, see #92972. */
 #ifdef __KERNEL_HIP__
   ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
   out[0] = half_pixel.x;
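The HIP workaround keeps its shape here: the half4 pixel is written through a scalar pointer one component at a time instead of as a single vector store. A plain C++ analogue (uint16_t stands in for a device half; illustrative only):

#include <cstdint>

struct half4 {
  uint16_t x, y, z, w;
};

void write_pixel(uint16_t *out, const half4 p)
{
  out[0] = p.x;
  out[1] = p.y;
  out[2] = p.z;
  out[3] = p.w;
}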
@@ -178,7 +178,7 @@ __device__
                                        simd_lane_index, \
                                        simd_group_index, \
                                        num_simd_groups, \
-                                       simdgroup_offset)
+                                       (threadgroup int *)threadgroup_array)
 #elif defined(__KERNEL_ONEAPI__)
 
 #  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
@@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
+
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+
+#  define atomic_store_local(p, x) \
+    atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
+#  define atomic_load_local(p) \
+    atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
+
+ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
+                                                     const uint partition_size,
+                                                     const uint max_shaders,
+                                                     const uint queued_kernel,
+                                                     ccl_global ushort *d_queued_kernel,
+                                                     ccl_global uint *d_shader_sort_key,
+                                                     ccl_global int *partition_key_offsets,
+                                                     ccl_gpu_shared int *buckets,
+                                                     const ushort local_id,
+                                                     const ushort local_size,
+                                                     const ushort grid_id)
+{
+  /* Zero the bucket sizes. */
+  if (local_id < max_shaders) {
+    atomic_store_local(&buckets[local_id], 0);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Determine bucket sizes within the partitions. */
+
+  const uint partition_start = partition_size * uint(grid_id);
+  const uint partition_end = min(num_states, partition_start + partition_size);
+
+  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
+       state_index += uint(local_size)) {
+    ushort kernel_index = d_queued_kernel[state_index];
+    if (kernel_index == queued_kernel) {
+      uint key = d_shader_sort_key[state_index] % max_shaders;
+      atomic_fetch_and_add_uint32(&buckets[key], 1);
+    }
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
+
+  if (local_id == 0) {
+    int offset = 0;
+    for (int i = 0; i < max_shaders; i++) {
+      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
+      offset = offset + atomic_load_local(&buckets[i]);
+    }
+
+    /* Store the number of active states in this partition. */
+    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
+  }
+}
+
+ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
+                                                    const uint partition_size,
+                                                    const uint max_shaders,
+                                                    const uint queued_kernel,
+                                                    const int num_states_limit,
+                                                    ccl_global int *indices,
+                                                    ccl_global ushort *d_queued_kernel,
+                                                    ccl_global uint *d_shader_sort_key,
+                                                    ccl_global int *partition_key_offsets,
+                                                    ccl_gpu_shared int *local_offset,
+                                                    const ushort local_id,
+                                                    const ushort local_size,
+                                                    const ushort grid_id)
+{
+  /* Calculate each partition's global offset from the prefix sum of the active state counts per
+   * partition. */
+
+  if (local_id < max_shaders) {
+    int partition_offset = 0;
+    for (int i = 0; i < uint(grid_id); i++) {
+      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
+      partition_offset += partition_key_count;
+    }
+
+    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
+    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Write the sorted active indices. */
+
+  const uint partition_start = partition_size * uint(grid_id);
+  const uint partition_end = min(num_states, partition_start + partition_size);
+
+  ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
+
+  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
+       state_index += uint(local_size)) {
+    ushort kernel_index = d_queued_kernel[state_index];
+    if (kernel_index == queued_kernel) {
+      uint key = d_shader_sort_key[state_index] % max_shaders;
+      int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
+      if (index < num_states_limit) {
+        indices[index] = state_index;
+      }
+    }
+  }
+}
+
+#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
+
 template<typename GetKeyOp>
 __device__ void gpu_parallel_sorted_index_array(const uint state_index,
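Stripped of the threadgroup parallelism and partitioning, the two passes above reduce to a counting sort. A single-threaded reference (illustrative, not part of the diff):

#include <cstdint>
#include <vector>

void sorted_indices_reference(const std::vector<uint16_t> &queued_kernel,
                              const std::vector<uint32_t> &shader_sort_key,
                              const uint16_t target_kernel,
                              const int max_shaders,
                              std::vector<int> &indices)
{
  /* Bucket pass: count active states per shader key. */
  std::vector<int> bucket_sizes(max_shaders, 0);
  for (size_t i = 0; i < queued_kernel.size(); i++) {
    if (queued_kernel[i] == target_kernel) {
      bucket_sizes[shader_sort_key[i] % max_shaders]++;
    }
  }

  /* Prefix sum of bucket sizes gives each key's first slot. */
  std::vector<int> offsets(max_shaders + 1, 0);
  for (int key = 0; key < max_shaders; key++) {
    offsets[key + 1] = offsets[key] + bucket_sizes[key];
  }

  /* Write pass: scatter state indices into their key's range. */
  indices.assign(offsets[max_shaders], -1);
  for (size_t i = 0; i < queued_kernel.size(); i++) {
    if (queued_kernel[i] == target_kernel) {
      indices[offsets[shader_sort_key[i] % max_shaders]++] = (int)i;
    }
  }
}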
@@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
     kernel_assert(!"Invalid ift_local");
     return false;
   }
-#  endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-
-  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
-  if (triangle_only) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid ift_local_prim");
+    return false;
   }
+#  endif
 
   MetalRTIntersectionLocalPayload payload;
   payload.self = ray->self;
@@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
   }
   payload.result = false;
 
-  typename metalrt_intersector_type::result_type intersection;
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
 
 #  if defined(__METALRT_MOTION__)
+  metalrt_intersector_type metalrt_intersect;
+  typename metalrt_intersector_type::result_type intersection;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
+  if (triangle_only) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
   intersection = metalrt_intersect.intersect(
       r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
 #  else
+
+  metalrt_blas_intersector_type metalrt_intersect;
+  typename metalrt_blas_intersector_type::result_type intersection;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
+  if (triangle_only) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  // if we know we are going to get max one hit, like for random-sss-walk we can
+  // optimize and accept the first hit
+  if (max_hits == 1) {
+    metalrt_intersect.accept_any_intersection(true);
+  }
+
+  int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
+  // transform the ray into object's local space
+  Transform itfm = kernel_data_fetch(objects, local_object).itfm;
+  r.origin = transform_point(&itfm, r.origin);
+  r.direction = transform_direction(&itfm, r.direction);
+
   intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
+      r,
+      metal_ancillaries->blas_accel_structs[blas_index].blas,
+      metal_ancillaries->ift_local_prim,
+      payload);
 #  endif
 
   if (lcg_state) {
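The BLAS path must intersect in object space, hence the itfm transform above. Assuming the same row-major 3x4 affine layout Cycles uses for Transform, points take the translation column while directions take only the linear part, mirroring transform_point() and transform_direction():

struct float3 {
  float x, y, z;
};

struct Transform {
  float m[3][4]; /* 3x4 affine: rotation/scale rows plus translation column */
};

float3 transform_point(const Transform *t, const float3 p)
{
  return {t->m[0][0] * p.x + t->m[0][1] * p.y + t->m[0][2] * p.z + t->m[0][3],
          t->m[1][0] * p.x + t->m[1][1] * p.y + t->m[1][2] * p.z + t->m[1][3],
          t->m[2][0] * p.x + t->m[2][1] * p.y + t->m[2][2] * p.z + t->m[2][3]};
}

float3 transform_direction(const Transform *t, const float3 d)
{
  return {t->m[0][0] * d.x + t->m[0][1] * d.y + t->m[0][2] * d.z,
          t->m[1][0] * d.x + t->m[1][1] * d.y + t->m[1][2] * d.z,
          t->m[2][0] * d.x + t->m[2][1] * d.y + t->m[2][2] * d.z};
}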
@@ -105,10 +105,11 @@ struct kernel_gpu_##name \
 { \
   PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
   void run(thread MetalKernelContext& context, \
-           threadgroup int *simdgroup_offset, \
+           threadgroup atomic_int *threadgroup_array, \
            const uint metal_global_id, \
            const ushort metal_local_id, \
            const ushort metal_local_size, \
+           const ushort metal_grid_id, \
            uint simdgroup_size, \
            uint simd_lane_index, \
            uint simd_group_index, \
@@ -117,22 +118,24 @@ struct kernel_gpu_##name \
 kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                                 constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
                                 constant MetalAncillaries *_metal_ancillaries, \
-                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
                                 const uint metal_global_id [[thread_position_in_grid]], \
                                 const ushort metal_local_id [[thread_position_in_threadgroup]], \
                                 const ushort metal_local_size [[threads_per_threadgroup]], \
+                                const ushort metal_grid_id [[threadgroup_position_in_grid]], \
                                 uint simdgroup_size [[threads_per_simdgroup]], \
                                 uint simd_lane_index [[thread_index_in_simdgroup]], \
                                 uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                 uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
   MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
-  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+  params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
 } \
 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
-                            threadgroup int *simdgroup_offset, \
+                            threadgroup atomic_int *threadgroup_array, \
                             const uint metal_global_id, \
                             const ushort metal_local_id, \
                             const ushort metal_local_size, \
+                            const ushort metal_grid_id, \
                             uint simdgroup_size, \
                             uint simd_lane_index, \
                             uint simd_group_index, \
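The threadgroup buffer changes type from int to atomic_int because the sort kernels issue relaxed atomic loads and stores on it (see the atomic_store_local/atomic_load_local macros earlier in this diff). The C++ equivalent of that access pattern, as a small sketch:

#include <atomic>

void zero_buckets(std::atomic<int> *buckets, const int count)
{
  for (int i = 0; i < count; i++) {
    buckets[i].store(0, std::memory_order_relaxed); /* cf. atomic_store_local() */
  }
}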
@@ -263,18 +266,34 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
 
 #  if defined(__METALRT_MOTION__)
 #    define METALRT_TAGS instancing, instance_motion, primitive_motion
+#    define METALRT_BLAS_TAGS , primitive_motion
 #  else
 #    define METALRT_TAGS instancing
+#    define METALRT_BLAS_TAGS
 #  endif /* __METALRT_MOTION__ */
 
 typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
 typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
 typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
+#  if defined(__METALRT_MOTION__)
+typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
+typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
+typedef metal::raytracing::intersector<triangle_data, primitive_motion>
+    metalrt_blas_intersector_type;
+#  else
+typedef acceleration_structure<> metalrt_blas_as_type;
+typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
+typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
+#  endif
 
 #endif /* __METALRT__ */
 
 /* texture bindings and sampler setup */
 
+struct Buffer1DParamsMetal {
+  device float *buf;
+};
+
 struct Texture2DParamsMetal {
   texture2d<float, access::sample> tex;
 };
@@ -282,15 +301,25 @@ struct Texture3DParamsMetal {
   texture3d<float, access::sample> tex;
 };
 
+#ifdef __METALRT__
+struct MetalRTBlasWrapper {
+  metalrt_blas_as_type blas;
+};
+#endif
+
 struct MetalAncillaries {
   device Texture2DParamsMetal *textures_2d;
   device Texture3DParamsMetal *textures_3d;
+  device Buffer1DParamsMetal *buffers;
 
 #ifdef __METALRT__
   metalrt_as_type accel_struct;
   metalrt_ift_type ift_default;
   metalrt_ift_type ift_shadow;
   metalrt_ift_type ift_local;
+  metalrt_blas_ift_type ift_local_prim;
+  constant MetalRTBlasWrapper *blas_accel_structs;
+  constant int *blas_userID_to_index_lookUp;
 #endif
 };
 
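User IDs need not be contiguous, so the new blas_userID_to_index_lookUp table maps an object's user ID to its slot in blas_accel_structs. A host-side sketch with made-up data (names follow the diff, the types are stand-ins):

#include <vector>

struct Blas { /* stand-in for metalrt_blas_as_type */ };
struct BlasWrapper {
  Blas blas;
};

int main()
{
  std::vector<BlasWrapper> blas_accel_structs(3);
  const std::vector<int> blas_userID_to_index_lookUp = {2, 0, 1}; /* userID -> index */

  const int local_object = 1; /* example user ID */
  const Blas &blas = blas_accel_structs[blas_userID_to_index_lookUp[local_object]].blas;
  (void)blas;
  return 0;
}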
@@ -3,6 +3,13 @@
 
 // clang-format off
 
+#ifdef WITH_NANOVDB
+#  define NDEBUG /* Disable "assert" in device code */
+#  define NANOVDB_USE_INTRINSICS
+#  include "nanovdb/NanoVDB.h"
+#  include "nanovdb/util/SampleFromVoxels.h"
+#endif
+
 /* Open the Metal kernel context class
  * Necessary to access resource bindings */
 class MetalKernelContext {
@@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #endif
 }
 
+[[intersection(triangle, triangle_data)]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri_prim(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+    uint primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
+{
+  /* instance_id, a.k.a. the user_id, has been removed. Reaching this function means SSS
+   * traversal started from a primitive acceleration structure instead of the root of the
+   * global AS, so the correct object is always intersected and no user_id check is needed. */
+  return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+      launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
+}
 [[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
 __anyhit__cycles_metalrt_local_hit_tri(
     constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
   return result;
 }
 
+[[intersection(bounding_box, triangle_data)]] BoundingBoxIntersectionResult
+__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
+{
+  /* Unused function. */
+  BoundingBoxIntersectionResult result;
+  result.distance = ray_tmax;
+  result.accept = false;
+  result.continue_search = false;
+  return result;
+}
+
 template<uint intersection_type>
 bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
                             ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
@@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
           kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
       break;
     }
+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
+      oneapi_call(
+          kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
+      break;
+    }
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
+      oneapi_call(
+          kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
+      break;
+    }
     case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
       oneapi_call(kg,
                   cgh,
@@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
   /* Index of main path which will be used by a next shadow catcher split. */
   ccl_global int *next_main_path_index;
 
+  /* Partition/key offsets used when writing sorted active indices. */
+  ccl_global int *sort_partition_key_offsets;
+
   /* Divisor used to partition active indices by locality when sorting by material. */
   uint sort_partition_divisor;
 } IntegratorStateGPU;