Metal: Resolve high memory pressure on EEVEE render

When EEVEE is rendering multiple samples via eevee_draw_scene, the command submission and in-flight memory pressure would grow until all samples completed, due to the lack of intermediate flushing of GPU work and memory. This patch adds a command flush and memory clear for this case, which occurs with high TAA sample counts during saving, similar to the process in EEVEE_render_draw. Authored by Apple: Michael Parkin-White. Pull Request: blender/blender#107221
Fix #106264 : Color picker broken with Wayland & AMD GPU
2023-04-24 10:01:01 +02:00 · 2023-04-21 21:31:26 +10:00 · 2023-04-21 21:03:55 +10:00 · 2023-04-21 20:56:37 +10:00 · 2023-04-21 20:43:43 +10:00 · 2023-04-21 15:40:03 +05:30
1962 changed files with 2100745 additions and 14308 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -236,6 +236,8 @@ ForEachMacros:
  - LOOP_UNSELECTED_POINTS
  - LOOP_VISIBLE_KEYS
  - LOOP_VISIBLE_POINTS
+  - LIGHT_FOREACH_BEGIN_DIRECTIONAL
+  - LIGHT_FOREACH_BEGIN_LOCAL
  - LISTBASE_CIRCULAR_BACKWARD_BEGIN
  - LISTBASE_CIRCULAR_FORWARD_BEGIN
  - LISTBASE_FOREACH
--- a/.gitea/default_merge_message/REBASE_TEMPLATE.md
+++ b/.gitea/default_merge_message/REBASE_TEMPLATE.md
@@ -2,4 +2,4 @@ ${CommitTitle}

 ${CommitBody}

-Pull Request #${PullRequestIndex}
+Pull Request: https://projects.blender.org/blender/blender/pulls/${PullRequestIndex}
--- a/.gitea/default_merge_message/SQUASH_TEMPLATE.md
+++ b/.gitea/default_merge_message/SQUASH_TEMPLATE.md
@@ -1,3 +1,3 @@
 ${PullRequestTitle}

-Pull Request #${PullRequestIndex}
+Pull Request: https://projects.blender.org/blender/blender/pulls/${PullRequestIndex}
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,5 +1,4 @@
-This repository is only used as a mirror of git.blender.org. Blender development happens on
-https://developer.blender.org.
+This repository is only used as a mirror. Blender development happens on projects.blender.org.

 To get started with contributing code, please see:
 https://wiki.blender.org/wiki/Process/Contributing_Code
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -15,8 +15,7 @@ staleLabel: stale
 # Comment to post when closing a stale Issue or Pull Request.
 closeComment: >
  This issue has been automatically closed, because this repository is only
-  used as a mirror of git.blender.org. Blender development happens on
-  developer.blender.org.
+  used as a mirror. Blender development happens on projects.blender.org.

  To get started contributing code, please read:
  https://wiki.blender.org/wiki/Process/Contributing_Code
--- a/.gitignore
+++ b/.gitignore
@@ -39,7 +39,7 @@ Desktop.ini
 /doc/python_api/rst/bmesh.ops.rst

 # in-source lib downloads
-/build_files/build_environment/downloads
+/build_files/build_environment/downloads/

 # in-source buildbot signing configuration
 /build_files/buildbot/codesign/config_server.py
@@ -48,4 +48,20 @@ Desktop.ini
 waveletNoiseTile.bin

 # testing environment
-/Testing
+/Testing/
+
+# Translations.
+/locale/user-config.py
+
+# External repositories.
+/scripts/addons/
+/scripts/addons_contrib/
+
+# Ignore old submodules directories.
+# Eventually need to get rid of those, but for the first time of transition
+# avoid incidents when the folders exist after bisect and developers staging
+# them by accident.
+/release/scripts/addons/
+/release/datafiles/locale/
+/release/scripts/addons_contrib/
+/source/tools/
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,20 +0,0 @@
-[submodule "release/scripts/addons"]
-	path = release/scripts/addons
-	url = ../blender-addons.git
-	branch = master
-	ignore = all
-[submodule "release/scripts/addons_contrib"]
-	path = release/scripts/addons_contrib
-	url = ../blender-addons-contrib.git
-	branch = master
-	ignore = all
-[submodule "release/datafiles/locale"]
-	path = release/datafiles/locale
-	url = ../blender-translations.git
-	branch = master
-	ignore = all
-[submodule "source/tools"]
-	path = source/tools
-	url = ../blender-dev-tools.git
-	branch = master
-	ignore = all
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -524,7 +524,7 @@ endif()
 if(NOT APPLE)
  option(WITH_CYCLES_DEVICE_HIP        "Enable Cycles AMD HIP support" ON)
  option(WITH_CYCLES_HIP_BINARIES      "Build Cycles AMD HIP binaries" OFF)
-  set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
+  set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx906 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
  mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
  mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
 endif()
@@ -625,8 +625,10 @@ mark_as_advanced(

 # Vulkan
 option(WITH_VULKAN_BACKEND "Enable Vulkan as graphics backend (only for development)" OFF)
+option(WITH_VULKAN_GUARDEDALLOC "Use guardedalloc for host allocations done inside Vulkan (development option)" OFF)
 mark_as_advanced(
  WITH_VULKAN_BACKEND
+  WITH_VULKAN_GUARDEDALLOC
 )

 # Metal
@@ -952,21 +954,6 @@ endif()
 # -----------------------------------------------------------------------------
 # Check if Sub-modules are Cloned

-if(WITH_INTERNATIONAL)
-  file(GLOB RESULT "${CMAKE_SOURCE_DIR}/release/datafiles/locale")
-  list(LENGTH RESULT DIR_LEN)
-  if(DIR_LEN EQUAL 0)
-    message(
-      WARNING
-      "Translation path '${CMAKE_SOURCE_DIR}/release/datafiles/locale' is missing, "
-      "This is a 'git submodule', which are known not to work with bridges to other version "
-      "control systems."
-    )
-    set(TRANSLATIONS_FOUND OFF)
-    set_and_warn_library_found("Translations" TRANSLATIONS_FOUND WITH_INTERNATIONAL)
-  endif()
-endif()
-
 if(WITH_PYTHON)
  # While we have this as an '#error' in 'bpy_capi_utils.h',
  # upgrading Python tends to cause confusion for users who build.
@@ -982,14 +969,14 @@ if(WITH_PYTHON)
    )
  endif()

-  file(GLOB RESULT "${CMAKE_SOURCE_DIR}/release/scripts/addons")
+  file(GLOB RESULT "${CMAKE_SOURCE_DIR}/scripts/addons")
  list(LENGTH RESULT DIR_LEN)
  if(DIR_LEN EQUAL 0)
    message(
      WARNING
-      "Addons path '${CMAKE_SOURCE_DIR}/release/scripts/addons' is missing, "
-      "This is a 'git submodule', which are known not to work with bridges to other version "
-      "control systems: * CONTINUING WITHOUT ADDONS *"
+      "Addons path '${CMAKE_SOURCE_DIR}/scripts/addons' is missing. "
+      "This is an external repository which needs to be checked out. Use `make update` to do so. "
+      "* CONTINUING WITHOUT ADDONS *"
    )
  endif()
 endif()
--- a/36
+++ b/36
@@ -69,7 +69,7 @@ Static Source Code Checking
   * check_cmake:           Runs our own cmake file checker which detects errors in the cmake file list definitions.
   * check_pep8:            Checks all Python script are pep8 which are tagged to use the stricter formatting.
   * check_mypy:            Checks all Python scripts using mypy,
-                            see: source/tools/check_source/check_mypy_config.py scripts which are included.
+                            see: tools/check_source/check_mypy_config.py scripts which are included.

 Documentation Checking

@@ -85,7 +85,7 @@ Spell Checkers
   * check_spelling_osl:    Check for spelling errors (OSL only).
   * check_spelling_py:     Check for spelling errors (Python only).

-   Note: an additional word-list is maintained at: 'source/tools/check_source/check_spelling_c_config.py'
+   Note: an additional word-list is maintained at: 'tools/check_source/check_spelling_c_config.py'

   Note: that spell checkers can take a 'CHECK_SPELLING_CACHE' filepath argument,
   so re-running does not need to re-check unchanged files.
@@ -299,7 +299,11 @@ else
 	ifneq ("$(wildcard $(DEPS_BUILD_DIR)/build.ninja)","")
 		DEPS_BUILD_COMMAND:=ninja
 	else
-		DEPS_BUILD_COMMAND:=make -s
+		ifeq ($(OS), Darwin)
+			DEPS_BUILD_COMMAND:=make -s
+		else
+			DEPS_BUILD_COMMAND:="$(BLENDER_DIR)/build_files/build_environment/linux/make_deps_wrapper.sh" -s
+		endif
 	endif
 endif

@@ -398,7 +402,7 @@ endif

 deps: .FORCE
 	@echo
-	@echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\"
+	@echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\", install to \"$(DEPS_INSTALL_DIR)\"

 	@cmake -H"$(DEPS_SOURCE_DIR)" \
 	       -B"$(DEPS_BUILD_DIR)" \
@@ -486,22 +490,22 @@ check_smatch: .FORCE
 	$(PYTHON) "$(BLENDER_DIR)/build_files/cmake/cmake_static_check_smatch.py"

 check_mypy: .FORCE
-	@$(PYTHON) "$(BLENDER_DIR)/source/tools/check_source/check_mypy.py"
+	@$(PYTHON) "$(BLENDER_DIR)/tools/check_source/check_mypy.py"

 check_wiki_file_structure: .FORCE
 	@PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    "$(BLENDER_DIR)/source/tools/check_wiki/check_wiki_file_structure.py"
+	    "$(BLENDER_DIR)/tools/check_wiki/check_wiki_file_structure.py"

 check_spelling_py: .FORCE
 	@cd "$(BUILD_DIR)" ; \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
-	    "$(BLENDER_DIR)/release/scripts"
+	    "$(BLENDER_DIR)/tools/check_source/check_spelling.py" \
+	    "$(BLENDER_DIR)/scripts"

 check_spelling_c: .FORCE
 	@cd "$(BUILD_DIR)" ; \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
+	    "$(BLENDER_DIR)/tools/check_source/check_spelling.py" \
 	    --cache-file=$(CHECK_SPELLING_CACHE) \
 	    "$(BLENDER_DIR)/source" \
 	    "$(BLENDER_DIR)/intern/cycles" \
@@ -511,21 +515,21 @@ check_spelling_c: .FORCE
 check_spelling_osl: .FORCE
 	@cd "$(BUILD_DIR)" ; \
 	PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    "$(BLENDER_DIR)/source/tools/check_source/check_spelling.py" \
+	    "$(BLENDER_DIR)/tools/check_source/check_spelling.py" \
 	    --cache-file=$(CHECK_SPELLING_CACHE) \
 	    "$(BLENDER_DIR)/intern/cycles/kernel/shaders"

 check_descriptions: .FORCE
 	@$(BLENDER_BIN) --background -noaudio --factory-startup --python \
-	    "$(BLENDER_DIR)/source/tools/check_source/check_descriptions.py"
+	    "$(BLENDER_DIR)/tools/check_source/check_descriptions.py"

 check_deprecated: .FORCE
 	@PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    source/tools/check_source/check_deprecated.py
+	    tools/check_source/check_deprecated.py

 check_licenses: .FORCE
 	@PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    "$(BLENDER_DIR)/source/tools/check_source/check_licenses.py" \
+	    "$(BLENDER_DIR)/tools/check_source/check_licenses.py" \
 	    "--show-headers=$(SHOW_HEADERS)"

 check_pep8: .FORCE
@@ -534,7 +538,7 @@ check_pep8: .FORCE

 check_cmake: .FORCE
 	@PYTHONIOENCODING=utf_8 $(PYTHON) \
-	    source/tools/check_source/check_cmake_consistency.py
+	    tools/check_source/check_cmake_consistency.py


 # -----------------------------------------------------------------------------
@@ -572,8 +576,8 @@ update_code: .FORCE
 	@$(PYTHON) ./build_files/utils/make_update.py --no-libraries

 format: .FORCE
-	@PATH="${LIBDIR}/llvm/bin/:$(PATH)" $(PYTHON) source/tools/utils_maintenance/clang_format_paths.py $(PATHS)
-	@$(PYTHON) source/tools/utils_maintenance/autopep8_format_paths.py --autopep8-command="$(AUTOPEP8)" $(PATHS)
+	@PATH="${LIBDIR}/llvm/bin/:$(PATH)" $(PYTHON) tools/utils_maintenance/clang_format_paths.py $(PATHS)
+	@$(PYTHON) tools/utils_maintenance/autopep8_format_paths.py --autopep8-command="$(AUTOPEP8)" $(PATHS)


 # -----------------------------------------------------------------------------
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Development
 -----------

 - [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
- [Code Review & Bug Tracker](https://developer.blender.org)
+- [Code Review & Bug Tracker](https://projects.blender.org)
 - [Developer Forum](https://devtalk.blender.org)
 - [Developer Documentation](https://wiki.blender.org)

--- a/build_files/build_environment/CMakeLists.txt
+++ b/build_files/build_environment/CMakeLists.txt
@@ -78,12 +78,7 @@ include(cmake/tbb.cmake)
 include(cmake/python.cmake)
 include(cmake/llvm.cmake)
 include(cmake/osl.cmake)
-option(USE_PIP_NUMPY "Install NumPy using pip wheel instead of building from source" OFF)
-if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "x86_64"))
-  set(USE_PIP_NUMPY ON)
-else()
-  include(cmake/numpy.cmake)
-endif()
+include(cmake/numpy.cmake)
 include(cmake/python_site_packages.cmake)
 include(cmake/package_python.cmake)
 include(cmake/openimageio.cmake)
--- a/build_files/build_environment/cmake/download.cmake
+++ b/build_files/build_environment/cmake/download.cmake
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 ## Update and uncomment this in the release branch
-# set(BLENDER_VERSION 3.1)
+set(BLENDER_VERSION 3.5)

 function(download_source dep)
  set(TARGET_FILE ${${dep}_FILE})
--- a/build_files/build_environment/cmake/gmp.cmake
+++ b/build_files/build_environment/cmake/gmp.cmake
@@ -22,7 +22,7 @@ elseif(UNIX AND NOT APPLE)
  )
 endif()

-# Boolean crashes with Arm assembly, see T103423.
+# Boolean crashes with Arm assembly, see #103423.
 if(BLENDER_PLATFORM_ARM)
  set(GMP_OPTIONS
    ${GMP_OPTIONS}
--- a/build_files/build_environment/cmake/igc.cmake
+++ b/build_files/build_environment/cmake/igc.cmake
@@ -40,7 +40,8 @@ ExternalProject_Add(external_igc_llvm
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
 )
 add_dependencies(
  external_igc_llvm
@@ -55,9 +56,6 @@ ExternalProject_Add(external_igc_spirv_translator
  CONFIGURE_COMMAND echo .
  BUILD_COMMAND echo .
  INSTALL_COMMAND echo .
-  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
 )
 add_dependencies(
  external_igc_spirv_translator
--- a/build_files/build_environment/cmake/python.cmake
+++ b/build_files/build_environment/cmake/python.cmake
@@ -88,6 +88,19 @@ else()
    export LDFLAGS=${PYTHON_LDFLAGS} &&
    export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)

+  # NOTE: untested on APPLE so far.
+  if(NOT APPLE)
+    set(PYTHON_CONFIGURE_EXTRA_ARGS
+      ${PYTHON_CONFIGURE_EXTRA_ARGS}
+      # Used on most release Linux builds (Fedora, for example),
+      # increases build times noticeably with the benefit of a modest speedup at runtime.
+      --enable-optimizations
+      # While LTO is OK when building on the same system, it's incompatible across GCC versions,
+      # making it impractical for developers to build against, so keep it disabled.
+      # `--with-lto`
+    )
+  endif()
+
  ExternalProject_Add(external_python
    URL file://${PACKAGE_DIR}/${PYTHON_FILE}
    DOWNLOAD_DIR ${DOWNLOAD_DIR}
--- a/build_files/build_environment/cmake/python_site_packages.cmake
+++ b/build_files/build_environment/cmake/python_site_packages.cmake
@@ -38,15 +38,6 @@ ExternalProject_Add(external_python_site_packages
  --no-binary :all:
 )

-if(USE_PIP_NUMPY)
-  # Use only wheel (and not build from source) to stop NumPy from linking against buggy
-  # Accelerate framework backend on macOS. Official wheels are built with OpenBLAS.
-  ExternalProject_Add_Step(external_python_site_packages after_install
-    COMMAND ${PYTHON_BINARY} -m pip install --no-cache-dir numpy==${NUMPY_VERSION} --only-binary :all:
-    DEPENDEES install
-  )
-endif()
-
 add_dependencies(
  external_python_site_packages
  external_python
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -165,9 +165,9 @@ set(OPENMP_URI https://github.com/llvm/llvm-project/releases/download/llvmorg-${
 set(OPENMP_HASH_TYPE MD5)
 set(OPENMP_FILE openmp-${OPENMP_VERSION}.src.tar.xz)

-set(OPENIMAGEIO_VERSION v2.4.6.0)
+set(OPENIMAGEIO_VERSION v2.4.9.0)
 set(OPENIMAGEIO_URI https://github.com/OpenImageIO/oiio/archive/refs/tags/${OPENIMAGEIO_VERSION}.tar.gz)
-set(OPENIMAGEIO_HASH c7acc1b9a8fda04ef48f7de1feda4dae)
+set(OPENIMAGEIO_HASH 7da92a7d6029921a8599a977ff1efa2a)
 set(OPENIMAGEIO_HASH_TYPE MD5)
 set(OPENIMAGEIO_FILE OpenImageIO-${OPENIMAGEIO_VERSION}.tar.gz)

@@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
 # compiler, the versions used are taken from the following location
 # https://github.com/intel/intel-graphics-compiler/releases

-set(IGC_VERSION 1.0.12149.1)
+set(IGC_VERSION 1.0.13064.7)
 set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
-set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
+set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
 set(IGC_HASH_TYPE SHA256)
 set(IGC_FILE igc-${IGC_VERSION}.tar.gz)

@@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
 #
 # WARNING WARNING WARNING

-set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
+set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
 set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
-set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
+set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
 set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
 set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)

-set(IGC_VCINTRINSICS_VERSION v0.5.0)
+set(IGC_VCINTRINSICS_VERSION v0.11.0)
 set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
-set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
+set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
 set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
 set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)

@@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
 set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
 set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)

-set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
+set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
 set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
-set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
+set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
 set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
 set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)

@@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
 ### Intel Graphics Compiler DEPS END ###
 ########################################

-set(GMMLIB_VERSION intel-gmmlib-22.1.8)
+set(GMMLIB_VERSION intel-gmmlib-22.3.0)
 set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
-set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
+set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
 set(GMMLIB_HASH_TYPE SHA256)
 set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)

-set(OCLOC_VERSION 22.38.24278)
+set(OCLOC_VERSION 22.49.25018.21)
 set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
-set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
+set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
 set(OCLOC_HASH_TYPE SHA256)
 set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -517,7 +517,7 @@ OPENEXR_FORCE_REBUILD=false
 OPENEXR_SKIP=false
 _with_built_openexr=false

-OIIO_VERSION="2.4.6.0"
+OIIO_VERSION="2.4.9.0"
 OIIO_VERSION_SHORT="2.4"
 OIIO_VERSION_MIN="2.2.0"
 OIIO_VERSION_MEX="2.5.0"
--- a/build_files/build_environment/linux/make_deps_wrapper.sh
+++ b/build_files/build_environment/linux/make_deps_wrapper.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+# This script ensures:
+# - One dependency is built at a time.
+# - That dependency uses all available cores.
+#
+# Without this, simply calling `make -j$(nproc)` from the `${CMAKE_BUILD_DIR}/deps/`
+# directory will build many projects at once.
+#
+# This is undesirable for the following reasons:
+#
+# - The output from projects is mixed together,
+#   making it difficult to track down the cause of a build failure.
+#
+# - Larger dependencies such as LLVM can bottleneck the build process,
+#   making it necessary to cancel the build and manually run build commands in each directory.
+#
+# - Building many projects at once means canceling (Control-C) can lead to the build being in an undefined state.
+#   It's possible canceling happens as a patch is being applied or files are being copied.
+#   (steps that aren't part of the compilation process where it's typically safe to cancel).
+
+if [[ -z "$MY_MAKE_CALL_LEVEL" ]]; then
+  export MY_MAKE_CALL_LEVEL=0
+  export MY_MAKEFLAGS=$MAKEFLAGS
+
+  # Extract the jobs argument (`-jN`, `-j N`, `--jobs=N`) without consuming any argument.
+  add_next=0
+  for i in "$@"; do
+    case $i in
+      -j*)
+        export MY_JOBS_ARG=$i
+        if [ "$MY_JOBS_ARG" = "-j" ]; then
+          add_next=1
+        fi
+        ;;
+      --jobs=*)
+        # Do not `shift` here: it would drop the first argument from the "$@" passed to make.
+        MY_JOBS_ARG=$i
+        ;;
+      *)
+        if (( add_next == 1 )); then
+          MY_JOBS_ARG="$MY_JOBS_ARG $i"
+          add_next=0
+        fi
+        ;;
+    esac
+  done
+  unset i add_next
+
+  if [[ -z "$MY_JOBS_ARG" ]]; then
+    MY_JOBS_ARG="-j$(nproc)"
+  fi
+  export MY_JOBS_ARG
+  # Support user defined `MAKEFLAGS`.
+  export MAKEFLAGS="$MY_MAKEFLAGS -j1"
+else
+  export MY_MAKE_CALL_LEVEL=$(( MY_MAKE_CALL_LEVEL + 1 ))
+  if (( MY_MAKE_CALL_LEVEL == 1 )); then
+    # Important to set jobs to 1, otherwise user defined jobs argument is used.
+    export MAKEFLAGS="$MY_MAKEFLAGS -j1"
+  elif (( MY_MAKE_CALL_LEVEL == 2 )); then
+    # This is the level used by each sub-project.
+    export MAKEFLAGS="$MY_MAKEFLAGS $MY_JOBS_ARG"
+  fi
+  # Else leave `MAKEFLAGS` as-is, avoids setting a high number of jobs on recursive
+  # calls (which may easily run out of memory). Let the job-server handle the rest.
+fi
+
+# Useful for troubleshooting the wrapper.
+# echo "Call level: $MY_MAKE_CALL_LEVEL, args=$@".
+
+# Call actual make but ensure recursive calls run via this script.
+exec make MAKE="$0" "$@"
--- a/build_files/build_environment/patches/igc_opencl_clang.diff
+++ b/build_files/build_environment/patches/igc_opencl_clang.diff
@@ -1,7 +1,7 @@
 diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
 --- external_igc_opencl_clang.orig/CMakeLists.txt	2022-03-16 05:51:10 -0600
 +++ external_igc_opencl_clang/CMakeLists.txt	2022-05-23 10:40:09 -0600
-@@ -126,22 +126,24 @@
+@@ -147,22 +147,24 @@
         )
     endif()
 
--- a/build_files/cmake/Modules/FindClang.cmake
+++ b/build_files/cmake/Modules/FindClang.cmake
@@ -80,6 +80,7 @@ set(_CLANG_FIND_COMPONENTS
  clangAST
  clangLex
  clangBasic
+  clangSupport
 )

 set(_CLANG_LIBRARIES)
@@ -94,7 +95,9 @@ foreach(COMPONENT ${_CLANG_FIND_COMPONENTS})
    PATH_SUFFIXES
      lib64 lib
    )
-  list(APPEND _CLANG_LIBRARIES "${CLANG_${UPPERCOMPONENT}_LIBRARY}")
+  if(CLANG_${UPPERCOMPONENT}_LIBRARY)
+    list(APPEND _CLANG_LIBRARIES "${CLANG_${UPPERCOMPONENT}_LIBRARY}")
+  endif()
 endforeach()


--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)

  if(MY_WC_BRANCH STREQUAL "HEAD")
    # Detached HEAD, check whether commit hash is reachable
-    # in the master branch
+    # in the main branch
    execute_process(COMMAND git rev-parse --short=12 HEAD
                    WORKING_DIRECTORY ${SOURCE_DIR}
                    OUTPUT_VARIABLE MY_WC_HASH
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

-    execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
+    execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
                    WORKING_DIRECTORY ${SOURCE_DIR}
                    OUTPUT_VARIABLE _git_contains_check
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(NOT _git_contains_check STREQUAL "")
-      set(MY_WC_BRANCH "master")
+      set(MY_WC_BRANCH "main")
    else()
      execute_process(COMMAND git show-ref --tags -d
                      WORKING_DIRECTORY ${SOURCE_DIR}
@@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
                      OUTPUT_STRIP_TRAILING_WHITESPACE)

      if(_git_tag_hashes MATCHES "${_git_head_hash}")
-        set(MY_WC_BRANCH "master")
+        set(MY_WC_BRANCH "main")
      else()
        execute_process(COMMAND git branch --contains ${MY_WC_HASH}
                        WORKING_DIRECTORY ${SOURCE_DIR}
--- a/build_files/cmake/example_scripts/cmake_linux_install.sh
+++ b/build_files/cmake/example_scripts/cmake_linux_install.sh
@@ -11,11 +11,11 @@
 mkdir ~/blender-git
 cd ~/blender-git

-git clone http://git.blender.org/blender.git
+git clone https://projects.blender.org/blender/blender.git
 cd blender
 git submodule update --init --recursive
-git submodule foreach git checkout master
-git submodule foreach git pull --rebase origin master
+git submodule foreach git checkout main
+git submodule foreach git pull --rebase origin main

 # create build dir
 mkdir ~/blender-git/build-cmake
@@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
 echo ""
 echo "* Useful Commands *"
 echo "   Run Blender: ~/blender-git/blender/blender.bin"
-echo "   Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
+echo "   Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
 echo "   Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
 echo "   Build Blender: cd ~/blender-git/build-cmake ; make"
 echo ""
--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -544,7 +544,7 @@ endfunction()
 function(setup_platform_linker_libs
  target
  )
-  # jemalloc must be early in the list, to be before pthread (see T57998)
+  # jemalloc must be early in the list, to be before pthread (see #57998).
  if(WITH_MEM_JEMALLOC)
    target_link_libraries(${target} ${JEMALLOC_LIBRARIES})
  endif()
@@ -1090,7 +1090,7 @@ function(msgfmt_simple
  add_custom_command(
    OUTPUT  ${_file_to}
    COMMAND ${CMAKE_COMMAND} -E make_directory ${_file_to_path}
-    COMMAND "$<TARGET_FILE:msgfmt>" ${_file_from} ${_file_to}
+    COMMAND ${CMAKE_COMMAND} -E env ${PLATFORM_ENV_BUILD} "$<TARGET_FILE:msgfmt>" ${_file_from} ${_file_to}
    DEPENDS msgfmt ${_file_from})

  set_source_files_properties(${_file_to} PROPERTIES GENERATED TRUE)
@@ -1299,16 +1299,29 @@ macro(windows_install_shared_manifest)
  endif()
  if(WINDOWS_INSTALL_DEBUG)
    set(WINDOWS_CONFIGURATIONS "${WINDOWS_CONFIGURATIONS};Debug")
-    list(APPEND WINDOWS_SHARED_MANIFEST_DEBUG ${WINDOWS_INSTALL_FILES})
  endif()
  if(WINDOWS_INSTALL_RELEASE)
-    list(APPEND WINDOWS_SHARED_MANIFEST_RELEASE ${WINDOWS_INSTALL_FILES})
    set(WINDOWS_CONFIGURATIONS "${WINDOWS_CONFIGURATIONS};Release;RelWithDebInfo;MinSizeRel")
  endif()
-  install(FILES ${WINDOWS_INSTALL_FILES}
-          CONFIGURATIONS ${WINDOWS_CONFIGURATIONS}
-          DESTINATION "./blender.shared"
-  )
+  if(NOT WITH_PYTHON_MODULE)
+    # Blender executable with manifest.
+    if(WINDOWS_INSTALL_DEBUG)
+      list(APPEND WINDOWS_SHARED_MANIFEST_DEBUG ${WINDOWS_INSTALL_FILES})
+    endif()
+    if(WINDOWS_INSTALL_RELEASE)
+      list(APPEND WINDOWS_SHARED_MANIFEST_RELEASE ${WINDOWS_INSTALL_FILES})
+    endif()
+    install(FILES ${WINDOWS_INSTALL_FILES}
+            CONFIGURATIONS ${WINDOWS_CONFIGURATIONS}
+            DESTINATION "./blender.shared"
+    )
+  else()
+    # Python module without manifest.
+    install(FILES ${WINDOWS_INSTALL_FILES}
+            CONFIGURATIONS ${WINDOWS_CONFIGURATIONS}
+            DESTINATION "./bpy"
+    )
+  endif()
 endmacro()

 macro(windows_generate_manifest)
@@ -1325,24 +1338,28 @@ macro(windows_generate_manifest)
 endmacro()

 macro(windows_generate_shared_manifest)
-  windows_generate_manifest(
-    FILES "${WINDOWS_SHARED_MANIFEST_DEBUG}"
-    OUTPUT "${CMAKE_BINARY_DIR}/Debug/blender.shared.manifest"
-    NAME "blender.shared"
-  )
-  windows_generate_manifest(
-    FILES "${WINDOWS_SHARED_MANIFEST_RELEASE}"
-    OUTPUT "${CMAKE_BINARY_DIR}/Release/blender.shared.manifest"
-    NAME "blender.shared"
-  )
-  install(
-    FILES ${CMAKE_BINARY_DIR}/Release/blender.shared.manifest
-    DESTINATION "./blender.shared"
-    CONFIGURATIONS Release;RelWithDebInfo;MinSizeRel
-  )
-  install(
-    FILES ${CMAKE_BINARY_DIR}/Debug/blender.shared.manifest
-    DESTINATION "./blender.shared"
-    CONFIGURATIONS Debug
-  )
+  if(WINDOWS_SHARED_MANIFEST_DEBUG)
+    windows_generate_manifest(
+      FILES "${WINDOWS_SHARED_MANIFEST_DEBUG}"
+      OUTPUT "${CMAKE_BINARY_DIR}/Debug/blender.shared.manifest"
+      NAME "blender.shared"
+    )
+    install(
+      FILES ${CMAKE_BINARY_DIR}/Debug/blender.shared.manifest
+      DESTINATION "./blender.shared"
+      CONFIGURATIONS Debug
+    )
+  endif()
+  if(WINDOWS_SHARED_MANIFEST_RELEASE)
+    windows_generate_manifest(
+      FILES "${WINDOWS_SHARED_MANIFEST_RELEASE}"
+      OUTPUT "${CMAKE_BINARY_DIR}/Release/blender.shared.manifest"
+      NAME "blender.shared"
+    )
+    install(
+      FILES ${CMAKE_BINARY_DIR}/Release/blender.shared.manifest
+      DESTINATION "./blender.shared"
+      CONFIGURATIONS Release;RelWithDebInfo;MinSizeRel
+    )
+  endif()
 endmacro()
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -440,7 +440,7 @@ string(APPEND PLATFORM_LINKFLAGS " -stdlib=libc++")
 # Make stack size more similar to Embree, required for Embree.
 string(APPEND PLATFORM_LINKFLAGS_EXECUTABLE " -Wl,-stack_size,0x100000")

-# Suppress ranlib "has no symbols" warnings (workaround for T48250)
+# Suppress ranlib "has no symbols" warnings (workaround for #48250).
 set(CMAKE_C_ARCHIVE_CREATE   "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
 set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
 # llvm-ranlib doesn't support this flag. Xcode's libtool does.
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -114,14 +114,15 @@ add_definitions(-D_WIN32_WINNT=0x603)
 # First generate the manifest for tests since it will not need the dependency on the CRT.
 configure_file(${CMAKE_SOURCE_DIR}/release/windows/manifest/blender.exe.manifest.in ${CMAKE_CURRENT_BINARY_DIR}/tests.exe.manifest @ONLY)

-if(WITH_WINDOWS_BUNDLE_CRT)
-  set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP TRUE)
-  set(CMAKE_INSTALL_UCRT_LIBRARIES TRUE)
-  set(CMAKE_INSTALL_OPENMP_LIBRARIES ${WITH_OPENMP})
-  include(InstallRequiredSystemLibraries)
+# Always detect CRT paths, but only manually install with WITH_WINDOWS_BUNDLE_CRT.
+set(CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP TRUE)
+set(CMAKE_INSTALL_UCRT_LIBRARIES TRUE)
+set(CMAKE_INSTALL_OPENMP_LIBRARIES ${WITH_OPENMP})
+include(InstallRequiredSystemLibraries)

+if(WITH_WINDOWS_BUNDLE_CRT)
  # ucrtbase(d).dll cannot be in the manifest, due to the way windows 10 handles
-  # redirects for this dll, for details see T88813.
+  # redirects for this dll, for details see #88813.
  foreach(lib ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS})
    string(FIND ${lib} "ucrtbase" pos)
    if(NOT pos EQUAL -1)
@@ -141,7 +142,9 @@ if(WITH_WINDOWS_BUNDLE_CRT)
  install(FILES ${CMAKE_BINARY_DIR}/blender.crt.manifest DESTINATION ./blender.crt)
  set(BUNDLECRT "<dependency><dependentAssembly><assemblyIdentity type=\"win32\" name=\"blender.crt\" version=\"1.0.0.0\" /></dependentAssembly></dependency>")
 endif()
-set(BUNDLECRT "${BUNDLECRT}<dependency><dependentAssembly><assemblyIdentity type=\"win32\" name=\"blender.shared\" version=\"1.0.0.0\" /></dependentAssembly></dependency>")
+if(NOT WITH_PYTHON_MODULE)
+  set(BUNDLECRT "${BUNDLECRT}<dependency><dependentAssembly><assemblyIdentity type=\"win32\" name=\"blender.shared\" version=\"1.0.0.0\" /></dependentAssembly></dependency>")
+endif()
 configure_file(${CMAKE_SOURCE_DIR}/release/windows/manifest/blender.exe.manifest.in ${CMAKE_CURRENT_BINARY_DIR}/blender.exe.manifest @ONLY)


@@ -295,7 +298,7 @@ unset(MATERIALX_LIB_FOLDER_EXISTS)
 if(NOT MSVC_CLANG                  AND # Available with MSVC 15.7+ but not for CLANG.
   NOT WITH_WINDOWS_SCCACHE        AND # And not when sccache is enabled
   NOT VS_CLANG_TIDY)                  # Clang-tidy does not like these options
-  add_compile_options(/experimental:external /external:templates- /external:I "${LIBDIR}" /external:W0)
+  add_compile_options(/experimental:external /external:I "${LIBDIR}" /external:W0)
 endif()

 # Add each of our libraries to our cmake_prefix_path so find_package() could work
@@ -901,11 +904,11 @@ endif()

 if(WINDOWS_PYTHON_DEBUG)
  # Include the system scripts in the blender_python_system_scripts project.
-  file(GLOB_RECURSE inFiles "${CMAKE_SOURCE_DIR}/release/scripts/*.*" )
+  file(GLOB_RECURSE inFiles "${CMAKE_SOURCE_DIR}/scripts/*.*" )
  add_custom_target(blender_python_system_scripts SOURCES ${inFiles})
  foreach(_source IN ITEMS ${inFiles})
    get_filename_component(_source_path "${_source}" PATH)
-    string(REPLACE "${CMAKE_SOURCE_DIR}/release/scripts/" "" _source_path "${_source_path}")
+    string(REPLACE "${CMAKE_SOURCE_DIR}/scripts/" "" _source_path "${_source_path}")
    string(REPLACE "/" "\\" _group_path "${_source_path}")
    source_group("${_group_path}" FILES "${_source}")
  endforeach()
@@ -940,7 +943,7 @@ if(WINDOWS_PYTHON_DEBUG)
    file(WRITE ${USER_PROPS_FILE} "<?xml version=\"1.0\" encoding=\"utf-8\"?>
 <Project DefaultTargets=\"Build\" xmlns=\"http://schemas.microsoft.com/developer/msbuild/2003\">
  <PropertyGroup>
-    <LocalDebuggerCommandArguments>-con --env-system-scripts \"${CMAKE_SOURCE_DIR}/release/scripts\" </LocalDebuggerCommandArguments>
+    <LocalDebuggerCommandArguments>-con --env-system-scripts \"${CMAKE_SOURCE_DIR}/scripts\" </LocalDebuggerCommandArguments>
  </PropertyGroup>
 </Project>")
  endif()
@@ -1040,7 +1043,7 @@ endif()

 # Environment variables to run precompiled executables that needed libraries.
 list(JOIN PLATFORM_BUNDLED_LIBRARY_DIRS ";" _library_paths)
-set(PLATFORM_ENV_BUILD_DIRS "${LIBDIR}/OpenImageIO/bin\;${LIBDIR}/boost/lib\;${LIBDIR}/openexr/bin\;${LIBDIR}/imath/bin\;${PATH}")
+set(PLATFORM_ENV_BUILD_DIRS "${LIBDIR}/tbb/bin\;${LIBDIR}/OpenImageIO/bin\;${LIBDIR}/boost/lib\;${LIBDIR}/openexr/bin\;${LIBDIR}/imath/bin\;${PATH}")
 set(PLATFORM_ENV_BUILD "PATH=${PLATFORM_ENV_BUILD_DIRS}")
 # Install needs the additional folders from PLATFORM_ENV_BUILD_DIRS as well, as tools like idiff and abcls use the release mode dlls
 set(PLATFORM_ENV_INSTALL "PATH=${CMAKE_INSTALL_PREFIX_WITH_CONFIG}/blender.shared/\;${PLATFORM_ENV_BUILD_DIRS}\;$ENV{PATH}")
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -1,53 +1,3 @@
-#
-# Used by Buildbot build pipeline make_update.py script only for now
-# We intended to update the make_update.py in the branches to use this file eventually
-#
-update-code:
-    git:
-        submodules:
-        -   branch: master
-            commit_id: HEAD
-            path: release/scripts/addons
-        -   branch: master
-            commit_id: HEAD
-            path: release/scripts/addons_contrib
-        -   branch: master
-            commit_id: HEAD
-            path: release/datafiles/locale
-        -   branch: master
-            commit_id: HEAD
-            path: source/tools
-    svn:
-        libraries:
-            darwin-arm64:
-                branch: trunk
-                commit_id: HEAD
-                path: lib/darwin_arm64
-            darwin-x86_64:
-                branch: trunk
-                commit_id: HEAD
-                path: lib/darwin
-            linux-x86_64:
-                branch: trunk
-                commit_id: HEAD
-                path: lib/linux_x86_64_glibc_228
-            windows-amd64:
-                branch: trunk
-                commit_id: HEAD
-                path: lib/win64_vc15
-        tests:
-            branch: trunk
-            commit_id: HEAD
-            path: lib/tests
-        benchmarks:
-            branch: trunk
-            commit_id: HEAD
-            path: lib/benchmarks
-        assets:
-            branch: trunk
-            commit_id: HEAD
-            path: lib/assets
-
 #
 # Buildbot only configs
 #
--- a/build_files/utils/make_bpy_wheel.py
+++ b/build_files/utils/make_bpy_wheel.py
@@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
 ## Source Code

 * [Releases](https://download.blender.org/source/)
-* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
+* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)

 ## Credits

--- a/build_files/utils/make_source_archive.py
+++ b/build_files/utils/make_source_archive.py
@@ -112,6 +112,7 @@ def create_manifest(
    print(f'Building manifest of files:  "{outpath}"...', end="", flush=True)
    with outpath.open("w", encoding="utf-8") as outfile:
        main_files_to_manifest(blender_srcdir, outfile)
+        assets_to_manifest(blender_srcdir, outfile)
        submodules_to_manifest(blender_srcdir, version, outfile)

        if packages_dir:
@@ -131,17 +132,27 @@ def submodules_to_manifest(
    skip_addon_contrib = version.is_release()
    assert not blender_srcdir.is_absolute()

-    for line in git_command("-C", blender_srcdir, "submodule"):
-        submodule = line.split()[1]
-
+    for submodule in ("scripts/addons", "scripts/addons_contrib"):
        # Don't use native slashes as GIT for MS-Windows outputs forward slashes.
-        if skip_addon_contrib and submodule == "release/scripts/addons_contrib":
+        if skip_addon_contrib and submodule == "scripts/addons_contrib":
            continue

        for path in git_ls_files(blender_srcdir / submodule):
            print(path, file=outfile)


+def assets_to_manifest(blender_srcdir: Path, outfile: TextIO) -> None:
+    assert not blender_srcdir.is_absolute()
+
+    assets_dir = blender_srcdir.parent / "lib" / "assets"
+    for path in assets_dir.glob("*"):
+        if path.name == "working":
+            continue
+        if path.name in SKIP_NAMES:
+            continue
+        print(path, file=outfile)
+
+
 def packages_to_manifest(outfile: TextIO, packages_dir: Path) -> None:
    for path in packages_dir.glob("*"):
        if not path.is_file():
@@ -172,7 +183,9 @@ def create_tarball(
    command += [
        "--transform",
        f"s,^{blender_srcdir.name}/,blender-{version}/,g",
-        "--use-compress-program=xz -9",
+        "--transform",
+        f"s,^lib/assets/,blender-{version}/release/datafiles/assets/,g",
+        "--use-compress-program=xz -1",
        "--create",
        f"--file={tarball}",
        f"--files-from={manifest}",
--- a/build_files/utils/make_update.py
+++ b/build_files/utils/make_update.py
@@ -16,14 +16,28 @@ import shutil
 import sys

 import make_utils
+from pathlib import Path
 from make_utils import call, check_output
+from urllib.parse import urljoin

 from typing import (
    List,
+    Iterable,
    Optional,
 )


+class Submodule:
+    path: str
+    branch: str
+    branch_fallback: str
+
+    def __init__(self, path: str, branch: str, branch_fallback: str) -> None:
+        self.path = path
+        self.branch = branch
+        self.branch_fallback = branch_fallback
+
+
 def print_stage(text: str) -> None:
    print("")
    print(text)
@@ -42,6 +56,7 @@ def parse_arguments() -> argparse.Namespace:
    parser.add_argument("--svn-branch", default=None)
    parser.add_argument("--git-command", default="git")
    parser.add_argument("--use-linux-libraries", action="store_true")
+    parser.add_argument("--architecture", type=str, choices=("x86_64", "amd64", "arm64",))
    return parser.parse_args()


@@ -51,6 +66,17 @@ def get_blender_git_root() -> str:
 # Setup for precompiled libraries and tests from svn.


+def get_effective_architecture(args: argparse.Namespace) -> str:
+    if args.architecture:
+        return str(args.architecture)
+
+    # `platform.machine()` reports x86_64 for an x86_64 Python binary running on arm64 hardware,
+    # so check `platform.version()` first to detect that case.
+    if "ARM64" in platform.version():
+        return "arm64"
+
+    return platform.machine().lower()
+
+
 def svn_update(args: argparse.Namespace, release_version: Optional[str]) -> None:
    svn_non_interactive = [args.svn_command, '--non-interactive']

@@ -58,11 +84,11 @@ def svn_update(args: argparse.Namespace, release_version: Optional[str]) -> None
    svn_url = make_utils.svn_libraries_base_url(release_version, args.svn_branch)

    # Checkout precompiled libraries
+    architecture = get_effective_architecture(args)
    if sys.platform == 'darwin':
-        # Check platform.version to detect arm64 with x86_64 python binary.
-        if platform.machine() == 'arm64' or ('ARM64' in platform.version()):
+        if architecture == 'arm64':
            lib_platform = "darwin_arm64"
-        elif platform.machine() == 'x86_64':
+        elif architecture == 'x86_64':
            lib_platform = "darwin"
        else:
            lib_platform = None
@@ -170,7 +196,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
        return "rebase or merge in progress, complete it first"

    # Abort if uncommitted changes.
-    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
+    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
    if len(changes) != 0:
        return "you have unstaged changes"

@@ -184,97 +210,296 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
    return ""


+def use_upstream_workflow(args: argparse.Namespace) -> bool:
+    return make_utils.git_remote_exist(args.git_command, "upstream")
+
+
+def work_tree_update_upstream_workflow(args: argparse.Namespace, use_fetch=True) -> str:
+    """
+    Update the Blender repository using the Github style of fork organization
+
+    Returns an empty string on success, "no_branch" if no "upstream/<current branch>" ref is
+    found, or an error message if the current branch could not be fast-forwarded to it.
+    """
+
+    branch_name = make_utils.git_branch(args.git_command)
+
+    if use_fetch:
+        call((args.git_command, "fetch", "upstream"))
+
+    upstream_branch = f"upstream/{branch_name}"
+    if not make_utils.git_branch_exists(args.git_command, upstream_branch):
+        return "no_branch"
+
+    retcode = call((args.git_command, "merge", "--ff-only", upstream_branch), exit_on_error=False)
+    if retcode != 0:
+        return "Unable to fast forward\n"
+
+    return ""
+
+
+def work_tree_update(args: argparse.Namespace, use_fetch=True) -> str:
+    """
+    Update the Git working tree using the best strategy
+
+    When an "upstream" remote is configured and has the current branch, the branch is fast-forwarded
+    to it; otherwise `git pull --rebase` is used. Returns an error message, or "" on success.
+    """
+
+    if use_upstream_workflow(args):
+        message = work_tree_update_upstream_workflow(args, use_fetch)
+        if message != "no_branch":
+            return message
+
+        # If there is upstream configured but the local branch is not in the upstream, try to
+        # update the branch from the fork.
+
+    update_command = [args.git_command, "pull", "--rebase"]
+
+    call(update_command)
+
+    return ""
+
+
 # Update blender repository.
-def blender_update(args: argparse.Namespace) -> None:
+def blender_update(args: argparse.Namespace) -> str:
    print_stage("Updating Blender Git Repository")
-    call([args.git_command, "pull", "--rebase"])
+
+    return work_tree_update(args)


-# Update submodules.
-def submodules_update(
-        args: argparse.Namespace,
-        release_version: Optional[str],
-        branch: Optional[str],
-) -> str:
-    print_stage("Updating Submodules")
-    if make_utils.command_missing(args.git_command):
-        sys.stderr.write("git not found, can't update code\n")
-        sys.exit(1)
+def resolve_external_url(blender_url: str, repo_name: str) -> str:
+    return urljoin(blender_url + "/", "../" + repo_name)

-    # Update submodules to appropriate given branch,
-    # falling back to master if none is given and/or found in a sub-repository.
-    branch_fallback = "master"
+
+def external_script_copy_old_submodule_over(args: argparse.Namespace, directory_name: str) -> None:
+    blender_git_root = Path(get_blender_git_root())
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    old_submodule_relative_dir = Path("release") / "scripts" / directory_name
+    print(f"Moving {old_submodule_relative_dir} to scripts/{directory_name} ...")
+
+    old_submodule_dir = blender_git_root / old_submodule_relative_dir
+    shutil.move(old_submodule_dir, external_dir)
+
+    # Remove old ".git" which is a file with path to a submodule bare repo inside of main
+    # repo .git/modules directory.
+    (external_dir / ".git").unlink()
+
+    bare_repo_relative_dir = Path(".git") / "modules" / "release" / "scripts" / directory_name
+    print(f"Copying {bare_repo_relative_dir} to scripts/{directory_name}/.git ...")
+    bare_repo_dir = blender_git_root / bare_repo_relative_dir
+    shutil.copytree(bare_repo_dir, external_dir / ".git")
+
+    git_config = external_dir / ".git" / "config"
+    call((args.git_command, "config", "--file", git_config, "--unset", "core.worktree"))
+
+
+def external_script_initialize_if_needed(args: argparse.Namespace,
+                                         repo_name: str,
+                                         directory_name: str) -> None:
+    """Create scripts/<directory_name> from the old submodule or a fresh clone, if it is missing"""
+
+    blender_git_root = Path(get_blender_git_root())
+    blender_dot_git = blender_git_root / ".git"
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    if external_dir.exists():
+        return
+
+    print(f"Initializing scripts/{directory_name} ...")
+
+    old_submodule_dot_git = blender_git_root / "release" / "scripts" / directory_name / ".git"
+    if old_submodule_dot_git.exists() and blender_dot_git.is_dir():
+        external_script_copy_old_submodule_over(args, directory_name)
+        return
+
+    origin_name = "upstream" if use_upstream_workflow(args) else "origin"
+    blender_url = make_utils.git_get_remote_url(args.git_command, origin_name)
+    external_url = resolve_external_url(blender_url, repo_name)
+
+    # When running `make update` from a freshly cloned fork, check whether the fork of the submodule
+    # is available. If not, switch to the submodule relative to the main blender repository.
+    if origin_name == "origin" and not make_utils.git_is_remote_repository(args.git_command, external_url):
+        external_url = resolve_external_url("https://projects.blender.org/blender/blender", repo_name)
+
+    call((args.git_command, "clone", "--origin", origin_name, external_url, external_dir))
+
+
+def external_script_add_origin_if_needed(args: argparse.Namespace,
+                                         repo_name: str,
+                                         directory_name: str) -> str:
+    """
+    Add remote called 'origin' if there is a fork of the external repository available
+
+    This is only done when using Github style upstream workflow in the main repository.
+    """
+
+    if not use_upstream_workflow(args):
+        return ""
+
+    cwd = os.getcwd()
+
+    blender_git_root = Path(get_blender_git_root())
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    origin_blender_url = make_utils.git_get_remote_url(args.git_command, "origin")
+    origin_external_url = resolve_external_url(origin_blender_url, repo_name)
+
+    try:
+        os.chdir(external_dir)
+
+        if (make_utils.git_remote_exist(args.git_command, "origin") or
+                not make_utils.git_remote_exist(args.git_command, "upstream")):
+            return ""
+
+        if not make_utils.git_is_remote_repository(args.git_command, origin_external_url):
+            return ""
+
+        print(f"Adding origin remote to {directory_name} pointing to fork ...")
+
+        # Non-obvious tricks to introduce the new remote called "origin" to the existing
+        # submodule configuration.
+        #
+        # This is all within the context of creating a fork of a submodule after `make update`
+        # has been run and possibly local branches tracking upstream were added.
+        #
+        # The idea here goes as follows:
+        #
+        #  - Rename remote "upstream" to "origin", which takes care of changing the names of
+        #    remotes the local branches are tracking.
+        #
+        #  - Change the URL of the "origin", which so far was still pointing to upstream.
+        #
+        #  - Re-introduce the "upstream" remote, with the same URL as it had prior to rename.
+
+        upstream_url = make_utils.git_get_remote_url(args.git_command, "upstream")
+
+        call((args.git_command, "remote", "rename", "upstream", "origin"))
+        make_utils.git_set_config(args.git_command, "remote.origin.url", origin_external_url)
+
+        call((args.git_command, "remote", "add", "upstream", upstream_url))
+    finally:
+        os.chdir(cwd)
+
+    return ""
+
+
+def external_scripts_update(args: argparse.Namespace,
+                            repo_name: str,
+                            directory_name: str,
+                            branch: Optional[str]) -> str:
+    """Update a single external checkout with the given name in the scripts folder"""
+
+    external_script_initialize_if_needed(args, repo_name, directory_name)
+    external_script_add_origin_if_needed(args, repo_name, directory_name)
+
+    print(f"Updating scripts/{directory_name} ...")
+
+    cwd = os.getcwd()
+
+    blender_git_root = Path(get_blender_git_root())
+    scripts_dir = blender_git_root / "scripts"
+    external_dir = scripts_dir / directory_name
+
+    # Update externals to appropriate given branch, falling back to main if none is given and/or
+    # found in a sub-repository.
+    branch_fallback = "main"
    if not branch:
        branch = branch_fallback

-    submodules = [
-        ("release/scripts/addons", branch, branch_fallback),
-        ("release/scripts/addons_contrib", branch, branch_fallback),
-        ("release/datafiles/locale", branch, branch_fallback),
-        ("source/tools", branch, branch_fallback),
-    ]
-
-    # Initialize submodules only if needed.
-    for submodule_path, submodule_branch, submodule_branch_fallback in submodules:
-        if not os.path.exists(os.path.join(submodule_path, ".git")):
-            call([args.git_command, "submodule", "update", "--init", "--recursive"])
-            break
-
-    # Checkout appropriate branch and pull changes.
    skip_msg = ""
-    for submodule_path, submodule_branch, submodule_branch_fallback in submodules:
-        cwd = os.getcwd()
-        try:
-            os.chdir(submodule_path)
-            msg = git_update_skip(args, check_remote_exists=False)
-            if msg:
-                skip_msg += submodule_path + " skipped: " + msg + "\n"
-            else:
-                # Find a matching branch that exists.
-                call([args.git_command, "fetch", "origin"])
-                if make_utils.git_branch_exists(args.git_command, submodule_branch):
-                    pass
-                elif make_utils.git_branch_exists(args.git_command, submodule_branch_fallback):
-                    submodule_branch = submodule_branch_fallback
-                else:
-                    # Skip.
-                    submodule_branch = ""

-                # Switch to branch and pull.
-                if submodule_branch:
-                    if make_utils.git_branch(args.git_command) != submodule_branch:
+    try:
+        os.chdir(external_dir)
+        msg = git_update_skip(args, check_remote_exists=False)
+        if msg:
+            skip_msg += directory_name + " skipped: " + msg + "\n"
+        else:
+            # Find a matching branch that exists.
+            for remote in ("origin", "upstream"):
+                if make_utils.git_remote_exist(args.git_command, remote):
+                    call([args.git_command, "fetch", remote])
+
+            submodule_branch = branch
+
+            if make_utils.git_branch_exists(args.git_command, submodule_branch):
+                pass
+            elif make_utils.git_branch_exists(args.git_command, branch_fallback):
+                submodule_branch = branch_fallback
+            else:
+                # Skip.
+                submodule_branch = ""
+
+            # Switch to branch and pull.
+            if submodule_branch:
+                if make_utils.git_branch(args.git_command) != submodule_branch:
+                    # If the local branch exists just check out to it.
+                    # If there is no local branch but only remote specify an explicit remote.
+                    # Without this explicit specification Git attempts to set-up tracking
+                    # automatically and fails when the branch is available in multiple remotes.
+                    if make_utils.git_local_branch_exists(args.git_command, submodule_branch):
                        call([args.git_command, "checkout", submodule_branch])
-                    call([args.git_command, "pull", "--rebase", "origin", submodule_branch])
-        finally:
-            os.chdir(cwd)
+                    elif make_utils.git_remote_exist(args.git_command, "origin"):
+                        call([args.git_command, "checkout", "-t", f"origin/{submodule_branch}"])
+                    elif make_utils.git_remote_exist(args.git_command, "upstream"):
+                        call([args.git_command, "checkout", "-t", f"upstream/{submodule_branch}"])
+                # Don't use extra fetch since all remotes of interest have been already fetched
+                # some lines above.
+                skip_msg += work_tree_update(args, use_fetch=False)
+    finally:
+        os.chdir(cwd)

    return skip_msg


+def scripts_submodules_update(args: argparse.Namespace, branch: Optional[str]) -> str:
+    """Update working trees of addons and addons_contrib within the scripts/ directory"""
+    msg = ""
+
+    msg += external_scripts_update(args, "blender-addons", "addons", branch)
+    msg += external_scripts_update(args, "blender-addons-contrib", "addons_contrib", branch)
+
+    return msg
+
+
+def submodules_update(args: argparse.Namespace, branch: Optional[str]) -> str:
+    """Update submodules or other externally tracked source trees"""
+    msg = ""
+
+    msg += scripts_submodules_update(args, branch)
+
+    return msg
+
+
 if __name__ == "__main__":
    args = parse_arguments()
    blender_skip_msg = ""
    submodules_skip_msg = ""

-    # Test if we are building a specific release version.
-    branch = make_utils.git_branch(args.git_command)
-    if branch == 'HEAD':
-        sys.stderr.write('Blender git repository is in detached HEAD state, must be in a branch\n')
-        sys.exit(1)
-
-    tag = make_utils.git_tag(args.git_command)
-    release_version = make_utils.git_branch_release_version(branch, tag)
+    blender_version = make_utils. parse_blender_version()
+    if blender_version.cycle != 'alpha':
+        major = blender_version.version // 100
+        minor = blender_version.version % 100
+        branch = f"blender-v{major}.{minor}-release"
+        release_version = f"{major}.{minor}"
+    else:
+        branch = 'main'
+        release_version = None

    if not args.no_libraries:
        svn_update(args, release_version)
    if not args.no_blender:
        blender_skip_msg = git_update_skip(args)
+        if not blender_skip_msg:
+            blender_skip_msg = blender_update(args)
        if blender_skip_msg:
            blender_skip_msg = "Blender repository skipped: " + blender_skip_msg + "\n"
-        else:
-            blender_update(args)
    if not args.no_submodules:
-        submodules_skip_msg = submodules_update(args, release_version, branch)
+        submodules_skip_msg = submodules_update(args, branch)

    # Report any skipped repositories at the end, so it's not as easy to miss.
    skip_msg = blender_skip_msg + submodules_skip_msg
--- a/build_files/utils/make_utils.py
+++ b/build_files/utils/make_utils.py
@@ -9,7 +9,9 @@ import re
 import shutil
 import subprocess
 import sys
+import os
 from pathlib import Path
+from urllib.parse import urljoin

 from typing import (
    Sequence,
@@ -19,7 +21,7 @@ from typing import (

 def call(cmd: Sequence[str], exit_on_error: bool = True, silent: bool = False) -> int:
    if not silent:
-        print(" ".join(cmd))
+        print(" ".join([str(x) for x in cmd]))

    # Flush to ensure correct order output on Windows.
    sys.stdout.flush()
@@ -52,13 +54,57 @@ def check_output(cmd: Sequence[str], exit_on_error: bool = True) -> str:
    return output.strip()


+def git_local_branch_exists(git_command: str, branch: str) -> bool:
+    return (
+        call([git_command, "rev-parse", "--verify", branch], exit_on_error=False, silent=True) == 0
+    )
+
+
 def git_branch_exists(git_command: str, branch: str) -> bool:
    return (
-        call([git_command, "rev-parse", "--verify", branch], exit_on_error=False, silent=True) == 0 or
+        git_local_branch_exists(git_command, branch) or
+        call([git_command, "rev-parse", "--verify", "remotes/upstream/" + branch], exit_on_error=False, silent=True) == 0 or
        call([git_command, "rev-parse", "--verify", "remotes/origin/" + branch], exit_on_error=False, silent=True) == 0
    )


+def git_get_remote_url(git_command: str, remote_name: str) -> str:
+    return check_output((git_command, "ls-remote", "--get-url", remote_name))
+
+
+def git_remote_exist(git_command: str, remote_name: str) -> bool:
+    """Check whether there is a remote with the given name"""
+    # `git ls-remote --get-url <name>` prints the URL of the remote if such a remote is configured,
+    # and otherwise prints <name> back unchanged.
+    remote_url = check_output((git_command, "ls-remote", "--get-url", remote_name))
+    return remote_url != remote_name
+
+
+def git_get_resolved_submodule_url(git_command: str, blender_url: str, submodule_path: str) -> str:
+    git_root = check_output([git_command, "rev-parse", "--show-toplevel"])
+    dot_gitmodules = os.path.join(git_root, ".gitmodules")
+
+    submodule_key_prefix = f"submodule.{submodule_path}"
+    submodule_key_url = f"{submodule_key_prefix}.url"
+
+    gitmodule_url = git_get_config(
+        git_command, submodule_key_url, file=dot_gitmodules)
+
+    # A bit of a trickery to construct final URL.
+    # Only works for the relative submodule URLs.
+    #
+    # Note that unless the LHS URL ends up with a slash urljoin treats the last component as a
+    # file.
+    assert gitmodule_url.startswith('..')
+    return urljoin(blender_url + "/", gitmodule_url)
+
+
+def git_is_remote_repository(git_command: str, repo: str) -> bool:
+    """Returns true if the given repository is a valid/clonable git repo"""
+    exit_code = call((git_command, "ls-remote", repo, "HEAD"), exit_on_error=False, silent=True)
+    return exit_code == 0
+
+
 def git_branch(git_command: str) -> str:
    # Get current branch name.
    try:
@@ -70,6 +116,20 @@ def git_branch(git_command: str) -> str:
    return branch.strip().decode('utf8')


+def git_get_config(git_command: str, key: str, file: Optional[str] = None) -> str:
+    if file:
+        return check_output([git_command, "config", "--file", file, "--get", key])
+
+    return check_output([git_command, "config", "--get", key])
+
+
+def git_set_config(git_command: str, key: str, value: str, file: Optional[str] = None) -> str:
+    if file:
+        return check_output([git_command, "config", "--file", file, key, value])
+
+    return check_output([git_command, "config", key, value])
+
+
 def git_tag(git_command: str) -> Optional[str]:
    # Get current tag name.
    try:
--- a/build_files/windows/check_submodules.cmd
+++ b/build_files/windows/check_submodules.cmd
@@ -1,20 +0,0 @@
-if NOT exist "%BLENDER_DIR%\source\tools\.git" (
-	echo Checking out sub-modules 
-	if not "%GIT%" == "" (
-		"%GIT%" submodule update --init --recursive --progress
-		if errorlevel 1 goto FAIL
-		"%GIT%" submodule foreach git checkout master
-		if errorlevel 1 goto FAIL
-		"%GIT%" submodule foreach git pull --rebase origin master
-		if errorlevel 1 goto FAIL
-		goto EOF
-	) else (
-		echo Blender submodules not found, and git not found in path to retrieve them.
-		goto FAIL
-	)
-)
-goto EOF
-
-:FAIL
-exit /b 1
-:EOF
--- a/build_files/windows/format.cmd
+++ b/build_files/windows/format.cmd
@@ -14,7 +14,7 @@ if NOT EXIST %PYTHON% (
    exit /b 1
 )

-set FORMAT_PATHS=%BLENDER_DIR%\source\tools\utils_maintenance\clang_format_paths.py
+set FORMAT_PATHS=%BLENDER_DIR%\tools\utils_maintenance\clang_format_paths.py

 REM The formatting script expects clang-format to be in the current PATH.
 set PATH=%CF_PATH%;%PATH%
--- a/build_files/windows/show_hashes.cmd
+++ b/build_files/windows/show_hashes.cmd
@@ -4,9 +4,9 @@ if "%GIT%" == "" (
 )
 cd "%BLENDER_DIR%"
 for /f "delims=" %%i in ('"%GIT%" rev-parse HEAD') do echo Branch_hash=%%i
-cd "%BLENDER_DIR%/release/datafiles/locale"
+cd "%BLENDER_DIR%/locale"
 for /f "delims=" %%i in ('"%GIT%" rev-parse HEAD') do echo Locale_hash=%%i
-cd "%BLENDER_DIR%/release/scripts/addons"
+cd "%BLENDER_DIR%/scripts/addons"
 for /f "delims=" %%i in ('"%GIT%" rev-parse HEAD') do echo Addons_Hash=%%i
 cd "%BLENDER_DIR%"
 :EOF
--- a/doc/python_api/examples/blf.py
+++ b/doc/python_api/examples/blf.py
@@ -37,7 +37,7 @@ def draw_callback_px(self, context):
    # BLF drawing routine
    font_id = font_info["font_id"]
    blf.position(font_id, 2, 80, 0)
-    blf.size(font_id, 50, 72)
+    blf.size(font_id, 50)
    blf.draw(font_id, "Hello World")


--- a/doc/python_api/rst/include__bmesh.rst
+++ b/doc/python_api/rst/include__bmesh.rst
@@ -31,7 +31,7 @@ For an overview of BMesh data types and how they reference each other see:
 Example Script
 --------------

-.. literalinclude:: __/__/__/release/scripts/templates_py/bmesh_simple.py
+.. literalinclude:: __/__/__/scripts/templates_py/bmesh_simple.py


 Standalone Module
--- a/doc/python_api/rst/info_quickstart.rst
+++ b/doc/python_api/rst/info_quickstart.rst
@@ -288,7 +288,7 @@ In Python, this is done by defining a class, which is a subclass of an existing
 Example Operator
 ----------------

-.. literalinclude:: __/__/__/release/scripts/templates_py/operator_simple.py
+.. literalinclude:: __/__/__/scripts/templates_py/operator_simple.py

 Once this script runs, ``SimpleOperator`` is registered with Blender
 and can be called from Operator Search or added to the toolbar.
@@ -320,7 +320,7 @@ Example Panel
 Panels are registered as a class, like an operator.
 Notice the extra ``bl_`` variables used to set the context they display in.

-.. literalinclude:: __/__/__/release/scripts/templates_py/ui_panel_simple.py
+.. literalinclude:: __/__/__/scripts/templates_py/ui_panel_simple.py

 To run the script:

--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -367,13 +367,13 @@ except ImportError:
 # Note that ".." is replaced by "__" in the RST files,
 # to avoid having to match Blender's source tree.
 EXTRA_SOURCE_FILES = (
-    "../../../release/scripts/templates_py/bmesh_simple.py",
-    "../../../release/scripts/templates_py/gizmo_operator.py",
-    "../../../release/scripts/templates_py/gizmo_operator_target.py",
-    "../../../release/scripts/templates_py/gizmo_simple.py",
-    "../../../release/scripts/templates_py/operator_simple.py",
-    "../../../release/scripts/templates_py/ui_panel_simple.py",
-    "../../../release/scripts/templates_py/ui_previews_custom_icon.py",
+    "../../../scripts/templates_py/bmesh_simple.py",
+    "../../../scripts/templates_py/gizmo_operator.py",
+    "../../../scripts/templates_py/gizmo_operator_target.py",
+    "../../../scripts/templates_py/gizmo_simple.py",
+    "../../../scripts/templates_py/operator_simple.py",
+    "../../../scripts/templates_py/ui_panel_simple.py",
+    "../../../scripts/templates_py/ui_previews_custom_icon.py",
    "../examples/bmesh.ops.1.py",
    "../examples/bpy.app.translations.py",
 )
@@ -476,7 +476,7 @@ MODULE_GROUPING = {

 # -------------------------------BLENDER----------------------------------------

-# converting bytes to strings, due to T30154
+# Converting bytes to strings, due to #30154.
 BLENDER_REVISION = str(bpy.app.build_hash, 'utf_8')
 BLENDER_REVISION_TIMESTAMP = bpy.app.build_commit_timestamp

@@ -487,7 +487,7 @@ BLENDER_VERSION_DOTS = "%d.%d" % (bpy.app.version[0], bpy.app.version[1])
 if BLENDER_REVISION != "Unknown":
    # SHA1 Git hash
    BLENDER_VERSION_HASH = BLENDER_REVISION
-    BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://developer.blender.org/rB%s>%s</a>" % (
+    BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://projects.blender.org/blender/blender/commit/%s>%s</a>" % (
        BLENDER_VERSION_HASH, BLENDER_VERSION_HASH,
    )
    BLENDER_VERSION_DATE = time.strftime("%d/%m/%Y", time.localtime(BLENDER_REVISION_TIMESTAMP))
@@ -647,7 +647,7 @@ def undocumented_message(module_name, type_name, identifier):
        module_name, type_name, identifier,
    )

-    return "Undocumented, consider `contributing <https://developer.blender.org/T51061>`__."
+    return "Undocumented, consider `contributing <https://developer.blender.org/>`__."


 def range_str(val):
@@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):

    # operators
    def write_ops():
-        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
-        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
-        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
+        API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/scripts"
+        API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
+        API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"

        op_modules = {}
        op = None
@@ -1853,8 +1853,6 @@ def pyrna2sphinx(basepath):
                fw("   %s\n\n" % operator_description)
                for prop in op.args:
                    write_param("   ", fw, prop)
-                if op.args:
-                    fw("\n")

                location = op.get_location()
                if location != (None, None):
@@ -1865,9 +1863,12 @@ def pyrna2sphinx(basepath):
                    else:
                        url_base = API_BASEURL

-                    fw("   :file: `%s\\:%d <%s/%s$%d>`_\n\n" %
+                    fw("   :File: `%s\\:%d <%s/%s#L%d>`__\n\n" %
                       (location[0], location[1], url_base, location[0], location[1]))

+                if op.args:
+                    fw("\n")
+
            file.close()

    if "bpy.ops" not in EXCLUDE_MODULES:
@@ -2200,7 +2201,7 @@ def write_rst_enum_items(basepath, key, key_no_prefix, enum_items):
    Write a single page for a static enum in RST.

    This helps avoiding very large lists being in-lined in many places which is an issue
-    especially with icons in ``bpy.types.UILayout``. See T87008.
+    especially with icons in ``bpy.types.UILayout``. See #87008.
    """
    filepath = os.path.join(basepath, "%s.rst" % key_no_prefix)
    with open(filepath, "w", encoding="utf-8") as fh:
--- a/doc/python_api/static/js/version_switch.js
+++ b/doc/python_api/static/js/version_switch.js
@@ -156,7 +156,7 @@ var Popover = function() {
    },
    getNamed : function(v) {
      $.each(all_versions, function(ix, title) {
-        if (ix === "master" || ix === "latest") {
+        if (ix === "master" || ix === "main" || ix === "latest") {
          var m = title.match(/\d\.\d[\w\d\.]*/)[0];
          if (parseFloat(m) == v) {
            v = ix;
--- a/extern/cuew/include/cuew.h
+++ b/extern/cuew/include/cuew.h
@@ -127,7 +127,7 @@ typedef uint32_t cuuint32_t;
 typedef uint64_t cuuint64_t;
 #endif

-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__) || defined(__ppc64__) || defined(__PPC64__)
 typedef unsigned long long CUdeviceptr;
 #else
 typedef unsigned int CUdeviceptr;
--- a/extern/hipew/README.blender
+++ b/extern/hipew/README.blender
@@ -1,5 +1,5 @@
 Project: Blender
-URL: https://git.blender.org/blender.git
+URL: https://projects.blender.org/blender/blender.git
 License: Apache 2.0
 Upstream version: N/A
 Local modifications: None
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
@@ -84,7 +84,7 @@ typedef uint32_t hipuint32_t;
 typedef uint64_t hipuint64_t;
 #endif

-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__)
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__) || defined(__ppc64__) || defined(__PPC64__)
 typedef unsigned long long hipDeviceptr_t;
 #else
 typedef unsigned int hipDeviceptr_t;
--- a/extern/quadriflow/patches/blender.patch
+++ b/extern/quadriflow/patches/blender.patch
@@ -231,3 +231,22 @@ index 355ee008246..a770bbee60c 100644
         }
         allocator.deallocate(values, capacity);
         capacity = 0;
+diff --git a/extern/quadriflow/src/hierarchy.cpp b/extern/quadriflow/src/hierarchy.cpp
+index 8cc41da23d0..70a9628320f 100644
+--- a/extern/quadriflow/src/hierarchy.cpp
++++ b/extern/quadriflow/src/hierarchy.cpp
+@@ -269,7 +269,13 @@ void Hierarchy::DownsampleGraph(const AdjacentMatrix adj, const MatrixXd& V, con
+         for (auto it = ad.begin(); it != ad.end(); ++it, ++entry_it) {
+             int k = it->id;
+             double dp = N.col(i).dot(N.col(k));
+-            double ratio = A[i] > A[k] ? (A[i] / A[k]) : (A[k] / A[i]);
++            double ratio;
++            if (A[i] > A[k]) {
++                ratio = (A[k] == 0.0f) ? 1.0f : A[i] / A[k];
++            }
++            else {
++                ratio = (A[i] == 0.0f) ? 1.0f : A[k] / A[i];
++            }
+             *entry_it = Entry(i, k, dp * ratio);
+         }
+     }
--- a/extern/quadriflow/src/hierarchy.cpp
+++ b/extern/quadriflow/src/hierarchy.cpp
@@ -269,7 +269,13 @@ void Hierarchy::DownsampleGraph(const AdjacentMatrix adj, const MatrixXd& V, con
        for (auto it = ad.begin(); it != ad.end(); ++it, ++entry_it) {
            int k = it->id;
            double dp = N.col(i).dot(N.col(k));
-            double ratio = A[i] > A[k] ? (A[i] / A[k]) : (A[k] / A[i]);
+            double ratio;
+            if (A[i] > A[k]) {
+                ratio = (A[k] == 0.0f) ? 1.0f : A[i] / A[k];
+            }
+            else {
+                ratio = (A[i] == 0.0f) ? 1.0f : A[k] / A[i];
+            }
            *entry_it = Entry(i, k, dp * ratio);
        }
    }
--- a/extern/sdlew/CMakeLists.txt
+++ b/extern/sdlew/CMakeLists.txt
@@ -7,7 +7,7 @@ set(INC
 )

 set(INC_SYS
-
+ ${X11_X11_INCLUDE_PATH}
 )

 set(SRC
--- a/extern/tinygltf/README.blender
+++ b/extern/tinygltf/README.blender
@@ -1,6 +1,5 @@
 Project: TinyGLTF
 URL: https://github.com/syoyo/tinygltf
 License: MIT
-Upstream version: 2.5.0, 19a41d20ec0
-Local modifications: 
-* Silence "enum value not handled in switch" warnings due to JSON dependency.
+Upstream version: 2.8.3, 84a83d39f55d
+Local modifications: None
--- a/extern/tinygltf/patches/TinyGLTF.diff
+++ b/extern/tinygltf/patches/TinyGLTF.diff
--- a/extern/tinygltf/tiny_gltf.h
+++ b/extern/tinygltf/tiny_gltf.h
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -12,6 +12,7 @@ from bpy.props import (
    PointerProperty,
    StringProperty,
 )
+from bpy.app.translations import pgettext_iface as iface_

 from math import pi

@@ -1664,30 +1665,48 @@ class CyclesPreferences(bpy.types.AddonPreferences):
            col.label(text="No compatible GPUs found for Cycles", icon='INFO')

            if device_type == 'CUDA':
-                col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
+                compute_capability = "3.0"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
            elif device_type == 'OPTIX':
-                col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
-                col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
+                compute_capability = "5.0"
+                driver_version = "470"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
+                col.label(text=iface_("and NVIDIA driver version %s or newer") % driver_version,
+                          icon='BLANK1', translate=False)
            elif device_type == 'HIP':
                import sys
                if sys.platform[:3] == "win":
-                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
+                    driver_version = "21.Q4"
+                    col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
+                    col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                elif sys.platform.startswith("linux"):
-                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
+                    driver_version = "22.10"
+                    col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
+                    col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
+                              translate=False)
            elif device_type == 'ONEAPI':
                import sys
                if sys.platform.startswith("win"):
+                    driver_version = "101.4032"
                    col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
-                    col.label(text="and Windows driver version 101.4032 or newer", icon='BLANK1')
+                    col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                elif sys.platform.startswith("linux"):
+                    driver_version = "1.3.24931"
                    col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
-                    col.label(text="  - intel-level-zero-gpu version 1.3.24931 or newer", icon='BLANK1')
+                    col.label(text=iface_("  - intel-level-zero-gpu version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                    col.label(text="  - oneAPI Level-Zero Loader", icon='BLANK1')
            elif device_type == 'METAL':
-                col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
-                col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
+                silicon_mac_version = "12.2"
+                amd_mac_version = "12.3"
+                col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
+                          icon='BLANK1', translate=False)
+                col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
+                          translate=False)
            return

        for device in devices:
@@ -1697,7 +1716,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                .replace('(TM)', unicodedata.lookup('TRADE MARK SIGN'))
                .replace('(tm)', unicodedata.lookup('TRADE MARK SIGN'))
                .replace('(R)', unicodedata.lookup('REGISTERED SIGN'))
-                .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN'))
+                .replace('(C)', unicodedata.lookup('COPYRIGHT SIGN')),
+                translate=False
            )

    def draw_impl(self, layout, context):
@@ -1722,19 +1742,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
            row.prop(self, "peer_memory")

        if compute_device_type == 'METAL':
-            import platform, re
-            isNavi2 = False
+            import platform
+            import re
+            is_navi_2 = False
            for device in devices:
-                obj = re.search("((RX)|(Pro)|(PRO))\s+W?6\d00X",device.name)
-                if obj:
-                    isNavi2 = True
+                if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
+                    is_navi_2 = True
+                    break

-            # MetalRT only works on Apple Silicon and Navi2
-            if platform.machine() == 'arm64' or isNavi2:
+            # MetalRT only works on Apple Silicon and Navi2.
+            is_arm64 = platform.machine() == 'arm64'
+            if is_arm64 or is_navi_2:
                col = layout.column()
                col.use_property_split = True
                # Kernel specialization is only supported on Apple Silicon
-                if platform.machine() == 'arm64':
+                if is_arm64:
                    col.prop(self, "kernel_optimization_level")
                col.prop(self, "use_metalrt")

--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -20,7 +20,7 @@ class CyclesPresetPanel(PresetPanel, Panel):
    @staticmethod
    def post_cb(context):
        # Modify an arbitrary built-in scene property to force a depsgraph
-        # update, because add-on properties don't. (see T62325)
+        # update, because add-on properties don't. (see #62325)
        render = context.scene.render
        render.filter_size = render.filter_size

--- a/intern/cycles/blender/display_driver.cpp
+++ b/intern/cycles/blender/display_driver.cpp
@@ -54,44 +54,10 @@ int BlenderDisplayShader::get_tex_coord_attrib_location()
 /* --------------------------------------------------------------------
 * BlenderFallbackDisplayShader.
 */
-
-/* TODO move shaders to standalone .glsl file. */
-static const char *FALLBACK_VERTEX_SHADER =
-    "uniform vec2 fullscreen;\n"
-    "in vec2 texCoord;\n"
-    "in vec2 pos;\n"
-    "out vec2 texCoord_interp;\n"
-    "\n"
-    "vec2 normalize_coordinates()\n"
-    "{\n"
-    "   return (vec2(2.0) * (pos / fullscreen)) - vec2(1.0);\n"
-    "}\n"
-    "\n"
-    "void main()\n"
-    "{\n"
-    "   gl_Position = vec4(normalize_coordinates(), 0.0, 1.0);\n"
-    "   texCoord_interp = texCoord;\n"
-    "}\n\0";
-
-static const char *FALLBACK_FRAGMENT_SHADER =
-    "uniform sampler2D image_texture;\n"
-    "in vec2 texCoord_interp;\n"
-    "out vec4 fragColor;\n"
-    "\n"
-    "void main()\n"
-    "{\n"
-    "   fragColor = texture(image_texture, texCoord_interp);\n"
-    "}\n\0";
-
 static GPUShader *compile_fallback_shader(void)
 {
  /* NOTE: Compilation errors are logged to console. */
-  GPUShader *shader = GPU_shader_create(FALLBACK_VERTEX_SHADER,
-                                        FALLBACK_FRAGMENT_SHADER,
-                                        nullptr,
-                                        nullptr,
-                                        nullptr,
-                                        "FallbackCyclesBlitShader");
+  GPUShader *shader = GPU_shader_create_from_info_name("gpu_shader_cycles_display_fallback");
  return shader;
 }

@@ -105,11 +71,12 @@ GPUShader *BlenderFallbackDisplayShader::bind(int width, int height)

  /* Bind shader now to enable uniform assignment. */
  GPU_shader_bind(shader_program_);
-  GPU_shader_uniform_int(shader_program_, image_texture_location_, 0);
+  int slot = 0;
+  GPU_shader_uniform_int_ex(shader_program_, image_texture_location_, 1, 1, &slot);
  float size[2];
  size[0] = width;
  size[1] = height;
-  GPU_shader_uniform_vector(shader_program_, fullscreen_location_, 2, 1, size);
+  GPU_shader_uniform_float_ex(shader_program_, fullscreen_location_, 2, 1, size);
  return shader_program_;
 }

--- a/intern/cycles/blender/image.cpp
+++ b/intern/cycles/blender/image.cpp
@@ -20,7 +20,7 @@ BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
    : b_image(b_image),
      frame(frame),
      tile_number(tile_number),
-      /* Don't free cache for preview render to avoid race condition from T93560, to be fixed
+      /* Don't free cache for preview render to avoid race condition from #93560, to be fixed
       * properly later as we are close to release. */
      free_cache(!is_preview_render && !b_image.has_data())
 {
@@ -72,7 +72,7 @@ bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaDat
    metadata.colorspace = u_colorspace_raw;
  }
  else {
-    /* In some cases (e.g. T94135), the colorspace setting in Blender gets updated as part of the
+    /* In some cases (e.g. #94135), the colorspace setting in Blender gets updated as part of the
     * metadata queries in this function, so update the colorspace setting here. */
    PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
    metadata.colorspace = get_enum_identifier(colorspace_ptr, "name");
--- a/intern/cycles/blender/light.cpp
+++ b/intern/cycles/blender/light.cpp
@@ -24,7 +24,7 @@ void BlenderSync::sync_light(BL::Object &b_parent,
  Light *light = light_map.find(key);

  /* Check if the transform was modified, in case a linked collection is moved we do not get a
-   * specific depsgraph update (T88515). This also mimics the behavior for Objects. */
+   * specific depsgraph update (#88515). This also mimics the behavior for Objects. */
  const bool tfm_updated = (light && light->get_tfm() != tfm);

  /* Update if either object or light data changed. */
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -94,7 +94,7 @@ void python_thread_state_restore(void **python_thread_state)
  *python_thread_state = NULL;
 }

-static const char *PyC_UnicodeAsByte(PyObject *py_str, PyObject **coerce)
+static const char *PyC_UnicodeAsBytes(PyObject *py_str, PyObject **coerce)
 {
  const char *result = PyUnicode_AsUTF8(py_str);
  if (result) {
@@ -131,8 +131,8 @@ static PyObject *init_func(PyObject * /*self*/, PyObject *args)
  }

  PyObject *path_coerce = nullptr, *user_path_coerce = nullptr;
-  path_init(PyC_UnicodeAsByte(path, &path_coerce),
-            PyC_UnicodeAsByte(user_path, &user_path_coerce));
+  path_init(PyC_UnicodeAsBytes(path, &path_coerce),
+            PyC_UnicodeAsBytes(user_path, &user_path_coerce));
  Py_XDECREF(path_coerce);
  Py_XDECREF(user_path_coerce);

--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -404,7 +404,7 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
     * point we know that we've got everything to render current view layer.
     */
    /* At the moment we only free if we are not doing multi-view
-     * (or if we are rendering the last view). See T58142/D4239 for discussion.
+     * (or if we are rendering the last view). See #58142/D4239 for discussion.
     */
    if (view_index == num_views - 1) {
      free_blender_memory_if_possible();
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -981,22 +981,8 @@ static ShaderNode *add_node(Scene *scene,
    sky->set_sun_disc(b_sky_node.sun_disc());
    sky->set_sun_size(b_sky_node.sun_size());
    sky->set_sun_intensity(b_sky_node.sun_intensity());
-    /* Patch sun position to be able to animate daylight cycle while keeping the shading code
-     * simple. */
-    float sun_rotation = b_sky_node.sun_rotation();
-    /* Wrap into [-2PI..2PI] range. */
-    float sun_elevation = fmodf(b_sky_node.sun_elevation(), M_2PI_F);
-    /* Wrap into [-PI..PI] range. */
-    if (fabsf(sun_elevation) >= M_PI_F) {
-      sun_elevation -= copysignf(2.0f, sun_elevation) * M_PI_F;
-    }
-    /* Wrap into [-PI/2..PI/2] range while keeping the same absolute position. */
-    if (sun_elevation >= M_PI_2_F || sun_elevation <= -M_PI_2_F) {
-      sun_elevation = copysignf(M_PI_F, sun_elevation) - sun_elevation;
-      sun_rotation += M_PI_F;
-    }
-    sky->set_sun_elevation(sun_elevation);
-    sky->set_sun_rotation(sun_rotation);
+    sky->set_sun_elevation(b_sky_node.sun_elevation());
+    sky->set_sun_rotation(b_sky_node.sun_rotation());
    sky->set_altitude(b_sky_node.altitude());
    sky->set_air_density(b_sky_node.air_density());
    sky->set_dust_density(b_sky_node.dust_density());
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -349,8 +349,7 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)

  bool use_light_tree = get_boolean(cscene, "use_light_tree");
  integrator->set_use_light_tree(use_light_tree);
-  integrator->set_light_sampling_threshold(
-      (use_light_tree) ? 0.0f : get_float(cscene, "light_sampling_threshold"));
+  integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold"));

  if (integrator->use_light_tree_is_modified()) {
    scene->light_manager->tag_update(scene, LightManager::UPDATE_ALL);
@@ -766,7 +765,7 @@ void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
      (BlenderSession::headless || is_interface_locked) &&
      /* Baking re-uses the depsgraph multiple times, clearing crashes
       * reading un-evaluated mesh data which isn't aligned with the
-       * geometry we're baking, see T71012. */
+       * geometry we're baking, see #71012. */
      !scene->bake_manager->get_baking() &&
      /* Persistent data must main caches for performance and correctness. */
      !is_persistent_data;
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
 }

 CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
+  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
+
  first_error = true;

  cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

  need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
  pitch_alignment = 0;

  /* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
-  cuda_assert(
-      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  int value;
+  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -499,311 +498,56 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
 #  endif
 }

-void CUDADevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void CUDADevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
-      device_memory &mem = *pair.first;
-      CUDAMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple CUDA devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
 {
  CUDAContextScope scope(this);

-  CUdeviceptr device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
  cuMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    cuMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = cuMemAlloc(&device_pointer, size);
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = CUDA_SUCCESS;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = cuMemHostAlloc(
-          &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
-      assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
-             (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != CUDA_SUCCESS) {
-    if (mem.type == MEM_DEVICE_ONLY) {
-      status = " failed, out of device memory";
-      set_error("System is out of GPU memory");
-    }
-    else {
-      status = " failed, out of device and host memory";
-      set_error("System is out of GPU and shared host memory");
-    }
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  CUDAMem *cmem = &cuda_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * CUDA memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }

-void CUDADevice::generic_copy_to(device_memory &mem)
+bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  CUDAContextScope scope(this);

-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const CUDAContextScope scope(this);
-    cuda_assert(
-        cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
+  return mem_alloc_result == CUDA_SUCCESS;
 }

-void CUDADevice::generic_free(device_memory &mem)
+void CUDADevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+  CUDAContextScope scope(this);

-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
+}

-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          cuMemFreeHost(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      cuda_assert(cuMemFree(mem.device_pointer));
-    }
+bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  CUDAContextScope scope(this);

-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  CUresult mem_alloc_result = cuMemHostAlloc(
+      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+  return mem_alloc_result == CUDA_SUCCESS;
+}

-    cuda_mem_map.erase(cuda_mem_map.find(&mem));
-  }
+void CUDADevice::free_host(void *shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuMemFreeHost(shared_pointer);
+}
+
+void CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
+}
+
+void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const CUDAContextScope scope(this);
+
+  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
 }

 void CUDADevice::mem_alloc(device_memory &mem)
@@ -868,8 +612,8 @@ void CUDADevice::mem_zero(device_memory &mem)

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const CUDAContextScope scope(this);
    cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
  }
@@ -994,19 +738,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
      return;
  }

-  CUDAMem *cmem = NULL;
+  Mem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1050,10 +794,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
    mem.device_size = size;
    stats.mem_alloc(size);

-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
@@ -1137,8 +881,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1153,9 +897,9 @@ void CUDADevice::tex_free(device_texture &mem)
 {
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
@@ -1164,16 +908,16 @@ void CUDADevice::tex_free(device_texture &mem)

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
-      cuArrayDestroy(cmem.array);
+      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class CUDADevice : public Device {
+class CUDADevice : public GPUDevice {

  friend class CUDAContextScope;

@@ -29,36 +29,11 @@ class CUDADevice : public Device {
  CUdevice cuDevice;
  CUcontext cuContext;
  CUmodule cuModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
  int pitch_alignment;
  int cuDevId;
  int cuDevArchitecture;
  bool first_error;

-  struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    CUtexObject texobject;
-    CUarray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, CUDAMem> CUDAMemMap;
-  CUDAMemMap cuda_mem_map;
-  thread_mutex cuda_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
  CUDADeviceKernels kernels;

  static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {

  void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

  void mem_alloc(device_memory &mem) override;

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
  return nullptr;
 }

+GPUDevice::~GPUDevice() noexcept(false)
+{
+}
+
+bool GPUDevice::load_texture_info()
+{
+  if (need_texture_info) {
+    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+    need_texture_info = false;
+    texture_info.copy_to_device();
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+
+void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
+                                 size_t preferred_working_headroom)
+{
+  /* Limit amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system
+   * memory free, whichever is smaller. */
+  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  size_t system_ram = system_physical_ram();
+
+  if (system_ram > 0) {
+    if (system_ram / 2 > default_limit) {
+      map_host_limit = system_ram - default_limit;
+    }
+    else {
+      map_host_limit = system_ram / 2;
+    }
+  }
+  else {
+    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations respectively. We set the working
+   * memory headroom lower than the texture headroom, so that some
+   * space is left for working memory after texture allocations. */
+  device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
+                                                             32 * 1024 * 1024LL;  // 32MB
+  device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
+                                                             128 * 1024 * 1024LL;  // 128MB
+
+  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    thread_scoped_lock lock(device_mem_map_mutex);
+    foreach (MemMap::value_type &pair, device_mem_map) {
+      device_memory &mem = *pair.first;
+      Mem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
+      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+                        (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* For other textures, only move image textures. */
+      if (for_texture && !is_image) {
+        continue;
+      }
+
+      /* Try to move largest allocation, prefer moving images. */
+      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+        max_is_image = is_image;
+        max_size = mem.device_size;
+        max_mem = &mem;
+      }
+    }
+    lock.unlock();
+
+    /* Move to host memory. This part is mutex protected since
+     * multiple backend devices could be moving the memory. The
+     * first one will do it, and the rest will adopt the pointer. */
+    if (max_mem) {
+      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
+
+      static thread_mutex move_mutex;
+      thread_scoped_lock lock(move_mutex);
+
+      any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another device would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well). */
+      max_mem->device_copy_to();
+      size = (max_size >= size) ? 0 : size - max_size;
+
+      any_device_moving_textures_to_host = false;
+    }
+    else {
+      break;
+    }
+  }
+
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
+  /* Update texture info array with new pointers. */
+  load_texture_info();
+}
+
+GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+  void *device_pointer = 0;
+  size_t size = mem.memory_size() + pitch_padding;
+
+  bool mem_alloc_result = false;
+  const char *status = "";
+
+  /* First try allocating in device memory, respecting headroom. We make
+   * an exception for texture info. It is small and frequently accessed,
+   * so treat it as working memory.
+   *
+   * If there is not enough room for working memory, we will try to move
+   * textures to host memory, assuming the performance impact would have
+   * been worse for working memory. */
+  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+  bool is_image = is_texture && (mem.data_height > 1);
+
+  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+  size_t total = 0, free = 0;
+  get_device_memory_info(total, free);
+
+  /* Move textures to host memory if needed. */
+  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+    move_textures_to_host(size + headroom - free, is_texture);
+    get_device_memory_info(total, free);
+  }
+
+  /* Allocate in device memory. */
+  if (!move_texture_to_host && (size + headroom) < free) {
+    mem_alloc_result = alloc_device(device_pointer, size);
+    if (mem_alloc_result) {
+      device_mem_in_use += size;
+      status = " in device memory";
+    }
+  }
+
+  /* Fall back to mapped host memory if needed and possible. */
+
+  void *shared_pointer = 0;
+
+  if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
+    if (mem.shared_pointer) {
+      /* Another device already allocated host memory. */
+      mem_alloc_result = true;
+      shared_pointer = mem.shared_pointer;
+    }
+    else if (map_host_used + size < map_host_limit) {
+      /* Allocate host memory ourselves. */
+      mem_alloc_result = alloc_host(shared_pointer, size);
+
+      assert((mem_alloc_result && shared_pointer != 0) ||
+             (!mem_alloc_result && shared_pointer == 0));
+    }
+
+    if (mem_alloc_result) {
+      transform_host_pointer(device_pointer, shared_pointer);
+      map_host_used += size;
+      status = " in host memory";
+    }
+  }
+
+  if (!mem_alloc_result) {
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
+  }
+
+  if (mem.name) {
+    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")" << status;
+  }
+
+  mem.device_pointer = (device_ptr)device_pointer;
+  mem.device_size = size;
+  stats.mem_alloc(size);
+
+  if (!mem.device_pointer) {
+    return NULL;
+  }
+
+  /* Insert into map of allocations. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  Mem *cmem = &device_mem_map[&mem];
+  if (shared_pointer != 0) {
+    /* Replace host pointer with our host allocation. Only works if
+     * memory layout is the same and has no pitch padding. Also
+     * does not work if we move textures to host during a render,
+     * since other devices might be using the memory. */
+
+    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+        mem.host_pointer != shared_pointer) {
+      memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A Call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device */
+
+      mem.host_free();
+      mem.host_pointer = shared_pointer;
+    }
+    mem.shared_pointer = shared_pointer;
+    mem.shared_counter++;
+    cmem->use_mapped_host = true;
+  }
+  else {
+    cmem->use_mapped_host = false;
+  }
+
+  return cmem;
+}
+
+void GPUDevice::generic_free(device_memory &mem)
+{
+  if (mem.device_pointer) {
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];
+
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free a mapped host memory. */
+
+    if (cmem.use_mapped_host) {
+      assert(mem.shared_pointer);
+      if (mem.shared_pointer) {
+        assert(mem.shared_counter > 0);
+        if (--mem.shared_counter == 0) {
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          free_host(mem.shared_pointer);
+          mem.shared_pointer = 0;
+        }
+      }
+      map_host_used -= mem.device_size;
+    }
+    else {
+      /* Free device memory. */
+      free_device((void *)mem.device_pointer);
+      device_mem_in_use -= mem.device_size;
+    }
+
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+}
+
+void GPUDevice::generic_copy_to(device_memory &mem)
+{
+  if (!mem.host_pointer || !mem.device_pointer) {
+    return;
+  }
+
+  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+   * backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
+   * copy data from mem.host_pointer. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+    copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
+  }
+}
+
 /* DeviceInfo */

 CCL_NAMESPACE_END
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -182,7 +182,7 @@ class Device {
  {
  }

-  /* Return true if device is ready for rendering, or report status if not. */
+  /* Report status and return true if device is ready for rendering. */
  virtual bool is_ready(string & /*status*/) const
  {
    return true;
@@ -309,6 +309,93 @@ class Device {
  static uint devices_initialized_mask;
 };

+/* Base class for GPU devices, providing functionality common to the GPU backends. */
+class GPUDevice : public Device {
+ protected:
+  GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+      : Device(info_, stats_, profiler_),
+        texture_info(this, "texture_info", MEM_GLOBAL),
+        need_texture_info(false),
+        can_map_host(false),
+        map_host_used(0),
+        map_host_limit(0),
+        device_texture_headroom(0),
+        device_working_headroom(0),
+        device_mem_map(),
+        device_mem_map_mutex(),
+        move_texture_to_host(false),
+        device_mem_in_use(0)
+  {
+  }
+
+ public:
+  virtual ~GPUDevice() noexcept(false);
+
+  /* For GPUs that can use bindless textures in some way or another. */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+  /* Returns true if the texture info was copied to the device (meaning, some more
+   * re-initialization might be needed). */
+  virtual bool load_texture_info();
+
+ protected:
+  /* Memory allocation, only accessed through device_memory. */
+  friend class device_memory;
+
+  bool can_map_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  typedef unsigned long long texMemObject;
+  typedef unsigned long long arrayMemObject;
+  struct Mem {
+    Mem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    texMemObject texobject;
+    arrayMemObject array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, Mem> MemMap;
+  MemMap device_mem_map;
+  thread_mutex device_mem_map_mutex;
+  bool move_texture_to_host;
+  /* Simple counter which will try to track amount of used device memory */
+  size_t device_mem_in_use;
+
+  virtual void init_host_memory(size_t preferred_texture_headroom = 0,
+                                size_t preferred_working_headroom = 0);
+  virtual void move_textures_to_host(size_t size, bool for_texture);
+
+  /* Allocation, deallocation and copy functions, with corresponding
+   * support of device/host allocations. */
+  virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+  virtual void generic_free(device_memory &mem);
+  virtual void generic_copy_to(device_memory &mem);
+
+  /* total - amount of device memory, free - amount of available device memory */
+  virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
+
+  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
+
+  virtual void free_device(void *device_pointer) = 0;
+
+  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
+
+  virtual void free_host(void *shared_pointer) = 0;
+
+  /* This function should set `device_pointer` to the device address corresponding to
+   * `shared_pointer`, which is a host buffer allocated in `alloc_host`. It returns no
+   * value, so a failed address transformation is not reported through the return value. */
+  virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
+
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
+};
+
 CCL_NAMESPACE_END

 #endif /* __DEVICE_H__ */
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
 }

 HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
+  static_assert(sizeof(arrayMemObject) == sizeof(hArray));
+
  first_error = true;

  hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

  need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
  pitch_alignment = 0;

  /* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  /* hipDeviceMapHost for mapping host memory when out of device memory.
   * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
-  hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  int value;
+  hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  can_map_host = value != 0;

  hip_assert(
      hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,305 +460,57 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
 #  endif
 }

-void HIPDevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void HIPDevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    foreach (HIPMemMap::value_type &pair, hip_mem_map) {
-      device_memory &mem = *pair.first;
-      HIPMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple HIP devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
 {
  HIPContextScope scope(this);

-  hipDeviceptr_t device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  hipError_t mem_alloc_result = hipErrorOutOfMemory;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
  hipMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    hipMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = hipMalloc(&device_pointer, size);
-    if (mem_alloc_result == hipSuccess) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != hipSuccess && can_map_host) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = hipSuccess;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = hipHostMalloc(
-          &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
-
-      assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
-             (mem_alloc_result != hipSuccess && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == hipSuccess) {
-      hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != hipSuccess) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  HIPMem *cmem = &hip_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * HIP memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }

-void HIPDevice::generic_copy_to(device_memory &mem)
+bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  HIPContextScope scope(this);

-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const HIPContextScope scope(this);
-    hip_assert(
-        hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
+  return mem_alloc_result == hipSuccess;
 }

-void HIPDevice::generic_free(device_memory &mem)
+void HIPDevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+  HIPContextScope scope(this);

-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  hip_assert(hipFree((hipDeviceptr_t)device_pointer));
+}

-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          hipHostFree(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      hip_assert(hipFree(mem.device_pointer));
-    }
+bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  HIPContextScope scope(this);

-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  hipError_t mem_alloc_result = hipHostMalloc(
+      &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);

-    hip_mem_map.erase(hip_mem_map.find(&mem));
-  }
+  return mem_alloc_result == hipSuccess;
+}
+
+void HIPDevice::free_host(void *shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hipHostFree(shared_pointer);
+}
+
+void HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
+}
+
+void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const HIPContextScope scope(this);
+
+  hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
 }

 void HIPDevice::mem_alloc(device_memory &mem)
@@ -823,8 +575,8 @@ void HIPDevice::mem_zero(device_memory &mem)

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const HIPContextScope scope(this);
    hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
  }
@@ -951,19 +703,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
      return;
  }

-  HIPMem *cmem = NULL;
+  Mem *cmem = NULL;
  hArray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (hArray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +759,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
    mem.device_size = size;
    stats.mem_alloc(size);

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +847,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
    texDesc.filterMode = filter_mode;
    texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];

    hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1111,9 +863,9 @@ void HIPDevice::tex_free(device_texture &mem)
 {
  if (mem.device_pointer) {
    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
@@ -1122,16 +874,16 @@ void HIPDevice::tex_free(device_texture &mem)

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
-      hipArrayDestroy(cmem.array);
+      hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
@@ -1153,7 +905,7 @@ bool HIPDevice::should_use_graphics_interop()
   * possible, but from the empiric measurements it can be considerably slower than using naive
   * pixels copy. */

-  /* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
+  /* Disable graphics interop for now, because of driver bug in 21.40. See #92972 */
 #  if 0
  HIPContextScope scope(this);

--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class HIPDevice : public Device {
+class HIPDevice : public GPUDevice {

  friend class HIPContextScope;

@@ -26,36 +26,11 @@ class HIPDevice : public Device {
  hipDevice_t hipDevice;
  hipCtx_t hipContext;
  hipModule_t hipModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
  int pitch_alignment;
  int hipDevId;
  int hipDevArchitecture;
  bool first_error;

-  struct HIPMem {
-    HIPMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    hipTextureObject_t texobject;
-    hArray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, HIPMem> HIPMemMap;
-  HIPMemMap hip_mem_map;
-  thread_mutex hip_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
  HIPDeviceKernels kernels;

  static bool have_precompiled_kernels();
@@ -81,17 +56,13 @@ class HIPDevice : public Device {
  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

  void mem_alloc(device_memory &mem) override;

--- a/intern/cycles/device/hip/util.h
+++ b/intern/cycles/device/hip/util.h
@@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

-  return (major >= 10);
+  return (major >= 9);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -247,6 +247,8 @@ class device_memory {
  bool is_resident(Device *sub_device) const;

 protected:
+  friend class Device;
+  friend class GPUDevice;
  friend class CUDADevice;
  friend class OptiXDevice;
  friend class HIPDevice;
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -29,7 +29,8 @@ class MetalDevice : public Device {
  id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
      nil; /* encoder used for fetching device pointers from MTLBuffers */
  string source[PSO_NUM];
-  string source_md5[PSO_NUM];
+  string kernels_md5[PSO_NUM];
+  string global_defines_md5[PSO_NUM];

  bool capture_enabled = false;

@@ -112,6 +113,10 @@ class MetalDevice : public Device {

  bool use_local_atomic_sort() const;

+  string preprocess_source(MetalPipelineType pso_type,
+                           const uint kernel_features,
+                           string *source = nullptr);
+
  bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);

  void make_source(MetalPipelineType pso_type, const uint kernel_features);
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -105,6 +105,7 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    }
    case METAL_GPU_AMD: {
      max_threads_per_threadgroup = 128;
+      use_metalrt = info.use_metalrt;
      break;
    }
    case METAL_GPU_APPLE: {
@@ -224,11 +225,15 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

    // preparing the blas arg encoder
-    MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
-    arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
-    arg_desc_blas.access = MTLArgumentAccessReadOnly;
-    mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
-    [arg_desc_blas release];
+    if (@available(macos 11.0, *)) {
+      if (use_metalrt) {
+        MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
+        arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
+        arg_desc_blas.access = MTLArgumentAccessReadOnly;
+        mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
+        [arg_desc_blas release];
+      }
+    }

    for (int i = 0; i < ancillary_desc.count; i++) {
      [ancillary_desc[i] release];
@@ -294,7 +299,9 @@ bool MetalDevice::use_local_atomic_sort() const
  return DebugFlags().metal.use_local_atomic_sort;
 }

-void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
+string MetalDevice::preprocess_source(MetalPipelineType pso_type,
+                                      const uint kernel_features,
+                                      string *source)
 {
  string global_defines;
  if (use_adaptive_compilation()) {
@@ -334,6 +341,61 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
  NSOperatingSystemVersion macos_ver = [processInfo operatingSystemVersion];
  global_defines += "#define __KERNEL_METAL_MACOS__ " + to_string(macos_ver.majorVersion) + "\n";

+  /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
+   * the same character length. The values of the active constants are not hashed here; that is
+   * done in `make_source_and_check_if_compile_needed` in order to identify the PSO.
+   */
+  if (pso_type != PSO_GENERIC) {
+    if (source) {
+      const double starttime = time_dt();
+
+#  define KERNEL_STRUCT_BEGIN(name, parent) \
+    string_replace_same_length(*source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
+
+      bool next_member_is_specialized = true;
+
+#  define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
+
+#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+    if (!next_member_is_specialized) { \
+      string_replace( \
+          *source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
+      next_member_is_specialized = true; \
+    }
+
+#  include "kernel/data_template.h"
+
+#  undef KERNEL_STRUCT_MEMBER
+#  undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
+#  undef KERNEL_STRUCT_BEGIN
+
+      metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
+    }
+
+    /* Opt in to all available specializations. This can be made more granular for the
+     * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
+     * but the overhead should be negligible as these are very quick to (re)build and aren't
+     * serialized to disk via MTLBinaryArchives.
+     */
+    global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
+  }
+
+#  if 0
+  metal_printf("================\n%s================\n",
+               global_defines.c_str());
+#  endif
+
+  if (source) {
+    *source = global_defines + *source;
+  }
+
+  MD5Hash md5;
+  md5.append(global_defines);
+  return md5.get_hex();
+}
+
+void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
+{
  string &source = this->source[pso_type];
  source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
  source = path_source_replace_includes(source, path_get("source"));
@@ -342,62 +404,7 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
   * With Metal function constants we can generate a single variant of the kernel source which can
   * be repeatedly respecialized.
   */
-  string baked_constants;
-
-  /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
-   * the same character length. Build a string of all active constant values which is then hashed
-   * in order to identify the PSO.
-   */
-  if (pso_type != PSO_GENERIC) {
-    const double starttime = time_dt();
-
-#  define KERNEL_STRUCT_BEGIN(name, parent) \
-    string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
-
-    bool next_member_is_specialized = true;
-
-#  define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
-
-    /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
-#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
-    if (next_member_is_specialized) { \
-      baked_constants += string(#parent "." #name "=") + \
-                         to_string(_type(launch_params.data.parent.name)) + "\n"; \
-    } \
-    else { \
-      string_replace( \
-          source, "kernel_data_" #parent "_" #name, "kernel_data." #parent ".__unused_" #name); \
-      next_member_is_specialized = true; \
-    }
-
-#  include "kernel/data_template.h"
-
-    /* Opt in to all of available specializations. This can be made more granular for the
-     * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
-     * but the overhead should be negligible as these are very quick to (re)build and aren't
-     * serialized to disk via MTLBinaryArchives.
-     */
-    global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
-
-    metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
-  }
-
-  source = global_defines + source;
-#  if 0
-  metal_printf("================\n%s================\n\%s================\n",
-               global_defines.c_str(),
-               baked_constants.c_str());
-#  endif
-
-  /* Generate an MD5 from the source and include any baked constants. This is used when caching
-   * PSOs. */
-  MD5Hash md5;
-  md5.append(baked_constants);
-  md5.append(source);
-  if (use_metalrt) {
-    md5.append(std::to_string(kernel_features & METALRT_FEATURE_MASK));
-  }
-  source_md5[pso_type] = md5.get_hex();
+  global_defines_md5[pso_type] = preprocess_source(pso_type, kernel_features, &source);
 }

 bool MetalDevice::load_kernels(const uint _kernel_features)
@@ -431,9 +438,49 @@ bool MetalDevice::load_kernels(const uint _kernel_features)

 bool MetalDevice::make_source_and_check_if_compile_needed(MetalPipelineType pso_type)
 {
-  if (this->source[pso_type].empty()) {
+  string defines_md5 = preprocess_source(pso_type, kernel_features);
+
+  /* Rebuild the source string if the injected block of #defines has changed. */
+  if (global_defines_md5[pso_type] != defines_md5) {
    make_source(pso_type, kernel_features);
  }
+
+  string constant_values;
+  if (pso_type != PSO_GENERIC) {
+    bool next_member_is_specialized = true;
+
+#  define KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE next_member_is_specialized = false;
+
+    /* Add specialization constants to md5 so that 'get_best_pipeline' is able to return a suitable
+     * match. */
+#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+    if (next_member_is_specialized) { \
+      constant_values += string(#parent "." #name "=") + \
+                         to_string(_type(launch_params.data.parent.name)) + "\n"; \
+    } \
+    else { \
+      next_member_is_specialized = true; \
+    }
+
+#  include "kernel/data_template.h"
+
+#  undef KERNEL_STRUCT_MEMBER
+#  undef KERNEL_STRUCT_MEMBER_DONT_SPECIALIZE
+
+#  if 0
+    metal_printf("================\n%s================\n",
+                constant_values.c_str());
+#  endif
+  }
+
+  MD5Hash md5;
+  md5.append(constant_values);
+  md5.append(source[pso_type]);
+  if (use_metalrt) {
+    md5.append(string_printf("metalrt_features=%d", kernel_features & METALRT_FEATURE_MASK));
+  }
+  kernels_md5[pso_type] = md5.get_hex();
+
  return MetalDeviceKernels::should_load_kernels(this, pso_type);
 }

@@ -520,6 +567,10 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
    thread_scoped_lock lock(existing_devices_mutex);
    if (MetalDevice *instance = get_device_by_ID(device_id, lock)) {
      if (mtlLibrary) {
+        if (error && [error localizedDescription]) {
+          VLOG_WARNING << "MSL compilation messages: " << [[error localizedDescription] UTF8String];
+        }
+
        instance->mtlLibrary[pso_type] = mtlLibrary;

        starttime = time_dt();
@@ -585,7 +636,7 @@ void MetalDevice::erase_allocation(device_memory &mem)
  if (it != metal_mem_map.end()) {
    MetalMem *mmem = it->second.get();

-    /* blank out reference to MetalMem* in the launch params (fixes crash T94736) */
+    /* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
    if (mmem->pointer_index >= 0) {
      device_ptr *pointers = (device_ptr *)&launch_params;
      pointers[mmem->pointer_index] = 0;
@@ -861,6 +912,11 @@ void MetalDevice::cancel()

 bool MetalDevice::is_ready(string &status) const
 {
+  if (!error_msg.empty()) {
+    /* Avoid hanging if we had an error. */
+    return true;
+  }
+
  int num_loaded = MetalDeviceKernels::get_loaded_kernel_count(this, PSO_GENERIC);
  if (num_loaded < DEVICE_KERNEL_NUM) {
    status = string_printf("%d / %d render kernels loaded (may take a few minutes the first time)",
@@ -868,6 +924,17 @@ bool MetalDevice::is_ready(string &status) const
                           DEVICE_KERNEL_NUM);
    return false;
  }
+
+  if (int num_requests = MetalDeviceKernels::num_incomplete_specialization_requests()) {
+    status = string_printf("%d kernels to optimize", num_requests);
+  }
+  else if (kernel_specialization_level == PSO_SPECIALIZED_INTERSECT) {
+    status = "Using optimized intersection kernels";
+  }
+  else if (kernel_specialization_level == PSO_SPECIALIZED_SHADE) {
+    status = "Using optimized kernels";
+  }
+
  metal_printf("MetalDevice::is_ready(...) --> true\n");
  return true;
 }
@@ -904,7 +971,7 @@ void MetalDevice::optimize_for_scene(Scene *scene)
  }

  if (specialize_in_background) {
-    if (!MetalDeviceKernels::any_specialization_happening_now()) {
+    if (MetalDeviceKernels::num_incomplete_specialization_requests() == 0) {
      dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
                     specialize_kernels_fn);
    }
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -63,8 +63,7 @@ enum MetalPipelineType {
 };

 #  define METALRT_FEATURE_MASK \
-    (KERNEL_FEATURE_HAIR | KERNEL_FEATURE_HAIR_THICK | KERNEL_FEATURE_POINTCLOUD | \
-     KERNEL_FEATURE_OBJECT_MOTION)
+    (KERNEL_FEATURE_HAIR | KERNEL_FEATURE_HAIR_THICK | KERNEL_FEATURE_POINTCLOUD)

 const char *kernel_type_as_string(MetalPipelineType pso_type);

@@ -76,12 +75,12 @@ struct MetalKernelPipeline {

  id<MTLLibrary> mtlLibrary = nil;
  MetalPipelineType pso_type;
-  string source_md5;
+  string kernels_md5;
  size_t usage_count = 0;

  KernelData kernel_data_;
  bool use_metalrt;
-  uint32_t metalrt_features = 0;
+  uint32_t kernel_features = 0;

  int threads_per_threadgroup;

@@ -104,7 +103,7 @@ struct MetalKernelPipeline {
 /* Cache of Metal kernels for each DeviceKernel. */
 namespace MetalDeviceKernels {

-bool any_specialization_happening_now();
+int num_incomplete_specialization_requests();
 int get_loaded_kernel_count(MetalDevice const *device, MetalPipelineType pso_type);
 bool should_load_kernels(MetalDevice const *device, MetalPipelineType pso_type);
 bool load(MetalDevice *device, MetalPipelineType pso_type);
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -161,25 +161,12 @@ ShaderCache::~ShaderCache()
  running = false;
  cond_var.notify_all();

-  int num_incomplete = int(incomplete_requests);
-  if (num_incomplete) {
-    /* Shutting down the app with incomplete shader compilation requests. Give 1 second's grace for
-     * clean shutdown. */
-    metal_printf("ShaderCache busy (incomplete_requests = %d)...\n", num_incomplete);
-    std::this_thread::sleep_for(std::chrono::seconds(1));
-    num_incomplete = int(incomplete_requests);
-  }
-
-  if (num_incomplete && !MetalDeviceKernels::is_benchmark_warmup()) {
-    metal_printf("ShaderCache still busy (incomplete_requests = %d). Terminating...\n",
-                 num_incomplete);
-    std::terminate();
-  }
-
-  metal_printf("ShaderCache idle. Shutting down.\n");
+  metal_printf("Waiting for ShaderCache threads... (incomplete_requests = %d)\n",
+               int(incomplete_requests));
  for (auto &thread : compile_threads) {
    thread.join();
  }
+  metal_printf("ShaderCache shut down.\n");
 }

 void ShaderCache::wait_for_all()
@@ -292,7 +279,7 @@ bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
    /* check whether the kernel has already been requested / cached */
    thread_scoped_lock lock(cache_mutex);
    for (auto &pipeline : pipelines[device_kernel]) {
-      if (pipeline->source_md5 == device->source_md5[pso_type]) {
+      if (pipeline->kernels_md5 == device->kernels_md5[pso_type]) {
        return false;
      }
    }
@@ -332,7 +319,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
  memcpy(&pipeline->kernel_data_, &device->launch_params.data, sizeof(pipeline->kernel_data_));
  pipeline->pso_type = pso_type;
  pipeline->mtlDevice = mtlDevice;
-  pipeline->source_md5 = device->source_md5[pso_type];
+  pipeline->kernels_md5 = device->kernels_md5[pso_type];
  pipeline->mtlLibrary = device->mtlLibrary[pso_type];
  pipeline->device_kernel = device_kernel;
  pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
@@ -344,9 +331,7 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,

  /* metalrt options */
  pipeline->use_metalrt = device->use_metalrt;
-  pipeline->metalrt_features = device->use_metalrt ?
-                                   (device->kernel_features & METALRT_FEATURE_MASK) :
-                                   0;
+  pipeline->kernel_features = device->kernel_features;

  {
    thread_scoped_lock lock(cache_mutex);
@@ -357,65 +342,36 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,

 MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const MetalDevice *device)
 {
-  /* metalrt options */
-  bool use_metalrt = device->use_metalrt;
-  bool device_metalrt_hair = use_metalrt && device->kernel_features & KERNEL_FEATURE_HAIR;
-  bool device_metalrt_hair_thick = use_metalrt &&
-                                   device->kernel_features & KERNEL_FEATURE_HAIR_THICK;
-  bool device_metalrt_pointcloud = use_metalrt &&
-                                   device->kernel_features & KERNEL_FEATURE_POINTCLOUD;
-  bool device_metalrt_motion = use_metalrt &&
-                               device->kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
-
-  MetalKernelPipeline *best_pipeline = nullptr;
-  while (!best_pipeline) {
+  while (running) {
+    /* Search all loaded pipelines with matching kernels_md5 checksums. */
+    MetalKernelPipeline *best_match = nullptr;
    {
      thread_scoped_lock lock(cache_mutex);
-      for (auto &pipeline : pipelines[kernel]) {
-        if (!pipeline->loaded) {
-          /* still loading - ignore */
-          continue;
-        }
-
-        bool pipeline_metalrt_hair = pipeline->metalrt_features & KERNEL_FEATURE_HAIR;
-        bool pipeline_metalrt_hair_thick = pipeline->metalrt_features & KERNEL_FEATURE_HAIR_THICK;
-        bool pipeline_metalrt_pointcloud = pipeline->metalrt_features & KERNEL_FEATURE_POINTCLOUD;
-        bool pipeline_metalrt_motion = use_metalrt &&
-                                       pipeline->metalrt_features & KERNEL_FEATURE_OBJECT_MOTION;
-
-        if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
-            pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
-            pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
-            pipeline_metalrt_motion != device_metalrt_motion) {
-          /* wrong combination of metalrt options */
-          continue;
-        }
-
-        if (pipeline->pso_type != PSO_GENERIC) {
-          if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
-              pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
-            best_pipeline = pipeline.get();
+      for (auto &candidate : pipelines[kernel]) {
+        if (candidate->loaded &&
+            candidate->kernels_md5 == device->kernels_md5[candidate->pso_type]) {
+          /* Replace existing match if candidate is more specialized. */
+          if (!best_match || candidate->pso_type > best_match->pso_type) {
+            best_match = candidate.get();
          }
        }
-        else if (!best_pipeline) {
-          best_pipeline = pipeline.get();
-        }
      }
    }

-    if (!best_pipeline) {
-      std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    if (best_match) {
+      if (best_match->usage_count == 0 && best_match->pso_type != PSO_GENERIC) {
+        metal_printf("Swapping in %s version of %s\n",
+                     kernel_type_as_string(best_match->pso_type),
+                     device_kernel_as_string(kernel));
+      }
+      best_match->usage_count += 1;
+      return best_match;
    }
-  }

-  if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
-    metal_printf("Swapping in %s version of %s\n",
-                 kernel_type_as_string(best_pipeline->pso_type),
-                 device_kernel_as_string(kernel));
+    /* Spin until a matching kernel is loaded, or we're shutting down. */
+    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
-  best_pipeline->usage_count += 1;
-
-  return best_pipeline;
+  return nullptr;
 }

 bool MetalKernelPipeline::should_use_binary_archive() const
@@ -428,11 +384,12 @@ bool MetalKernelPipeline::should_use_binary_archive() const
        return false;
      }
    }
-
-    /* Workaround for Intel GPU having issue using Binary Archives */
-    MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
-    if (gpu_vendor == METAL_GPU_INTEL) {
-      return false;
+    else {
+      /* Workaround for issues using Binary Archives on non-Apple Silicon systems. */
+      MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
+      if (gpu_vendor != METAL_GPU_APPLE) {
+        return false;
+      }
    }

    if (pso_type == PSO_GENERIC) {
@@ -440,8 +397,10 @@ bool MetalKernelPipeline::should_use_binary_archive() const
      return true;
    }

-    if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
-        device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+    if ((device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+         device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) ||
+        (device_kernel >= DEVICE_KERNEL_SHADER_EVAL_DISPLACE &&
+         device_kernel <= DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY)) {
      /* Archive all shade kernels - they take a long time to compile. */
      return true;
    }
@@ -567,18 +526,14 @@ void MetalKernelPipeline::compile()
  NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
  NSArray *linked_functions = nil;

-  bool metalrt_hair = use_metalrt && (metalrt_features & KERNEL_FEATURE_HAIR);
-  bool metalrt_hair_thick = use_metalrt && (metalrt_features & KERNEL_FEATURE_HAIR_THICK);
-  bool metalrt_pointcloud = use_metalrt && (metalrt_features & KERNEL_FEATURE_POINTCLOUD);
-
  if (use_metalrt) {
    id<MTLFunction> curve_intersect_default = nil;
    id<MTLFunction> curve_intersect_shadow = nil;
    id<MTLFunction> point_intersect_default = nil;
    id<MTLFunction> point_intersect_shadow = nil;
-    if (metalrt_hair) {
+    if (kernel_features & KERNEL_FEATURE_HAIR) {
      /* Add curve intersection programs. */
-      if (metalrt_hair_thick) {
+      if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
        /* Slower programs for thick hair since that also slows down ribbons.
         * Ideally this should not be needed. */
        curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
@@ -589,7 +544,7 @@ void MetalKernelPipeline::compile()
        curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
      }
    }
-    if (metalrt_pointcloud) {
+    if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
      point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
      point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
    }
@@ -666,6 +621,8 @@ void MetalKernelPipeline::compile()
  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;

  bool use_binary_archive = should_use_binary_archive();
+  bool loading_existing_archive = false;
+  bool creating_new_archive = false;

  id<MTLBinaryArchive> archive = nil;
  string metalbin_path;
@@ -674,20 +631,11 @@ void MetalKernelPipeline::compile()
    NSProcessInfo *processInfo = [NSProcessInfo processInfo];
    string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
    MD5Hash local_md5;
-    local_md5.append(source_md5);
+    local_md5.append(kernels_md5);
    local_md5.append(osVersion);
    local_md5.append((uint8_t *)&this->threads_per_threadgroup,
                     sizeof(this->threads_per_threadgroup));

-    string options;
-    if (use_metalrt && kernel_has_intersection(device_kernel)) {
-      /* incorporate any MetalRT specializations into the archive name */
-      options += string_printf(".hair_%d.hair_thick_%d.pointcloud_%d",
-                               metalrt_hair ? 1 : 0,
-                               metalrt_hair_thick ? 1 : 0,
-                               metalrt_pointcloud ? 1 : 0);
-    }
-
    /* Replace non-alphanumerical characters with underscores. */
    string device_name = [mtlDevice.name UTF8String];
    for (char &c : device_name) {
@@ -699,77 +647,141 @@ void MetalKernelPipeline::compile()
    metalbin_name = device_name;
    metalbin_name = path_join(metalbin_name, device_kernel_as_string(device_kernel));
    metalbin_name = path_join(metalbin_name, kernel_type_as_string(pso_type));
-    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + options + ".bin");
+    metalbin_name = path_join(metalbin_name, local_md5.get_hex() + ".bin");

    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

-    /* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as
-     * intended. */
-    if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) {
-      if (@available(macOS 11.0, *)) {
-        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+    /* Check if shader binary exists on disk, and if so, update the file timestamp for LRU purging
+     * to work as intended. */
+    loading_existing_archive = path_cache_kernel_exists_and_mark_used(metalbin_path);
+    creating_new_archive = !loading_existing_archive;
+
+    if (@available(macOS 11.0, *)) {
+      MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
+      if (loading_existing_archive) {
        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
-        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
-        [archiveDesc release];
+      }
+      NSError *error = nil;
+      archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:&error];
+      if (!archive) {
+        const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
+        metal_printf("newBinaryArchiveWithDescriptor failed: %s\n", err ? err : "nil");
+      }
+      [archiveDesc release];
+
+      if (loading_existing_archive) {
+        pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
+        computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
      }
    }
  }

-  __block bool creating_new_archive = false;
-  if (@available(macOS 11.0, *)) {
-    if (use_binary_archive) {
-      if (!archive) {
-        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
-        archiveDesc.url = nil;
-        archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
-        creating_new_archive = true;
-      }
-      computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
-      pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
+  bool recreate_archive = false;
+
+  /* Lambda to do the actual pipeline compilation. */
+  auto do_compilation = [&]() {
+    __block bool compilation_finished = false;
+    __block string error_str;
+
+    if (loading_existing_archive) {
+      /* Use the blocking variant of newComputePipelineStateWithDescriptor if an archive exists on
+       * disk. It should load almost instantaneously, and will fail gracefully when loading a
+       * corrupt archive (unlike the async variant). */
+      NSError *error = nil;
+      pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                                          options:pipelineOptions
+                                                       reflection:nullptr
+                                                            error:&error];
+      const char *err = error ? [[error localizedDescription] UTF8String] : nullptr;
+      error_str = err ? err : "nil";
    }
-  }
+    else {
+      /* TODO / MetalRT workaround:
+       * Workaround for a crash when addComputePipelineFunctionsWithDescriptor is called *after*
+       * newComputePipelineStateWithDescriptor with linked functions (i.e. with MetalRT enabled).
+       * Ideally we would like to call newComputePipelineStateWithDescriptor (async) first so we
+       * can bail out if needed, but we can stop the crash by flipping the order when there are
+       * linked functions. However when addComputePipelineFunctionsWithDescriptor is called first
+       * it will block while it builds the pipeline, offering no way of bailing out. */
+      auto addComputePipelineFunctionsWithDescriptor = [&]() {
+        if (creating_new_archive && ShaderCache::running) {
+          NSError *error;
+          if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
+                                                            error:&error]) {
+            NSString *errStr = [error localizedDescription];
+            metal_printf("Failed to add PSO to archive:\n%s\n",
+                         errStr ? [errStr UTF8String] : "nil");
+          }
+        }
+      };
+      if (linked_functions) {
+        addComputePipelineFunctionsWithDescriptor();
+      }
+
+      /* Use the async variant of newComputePipelineStateWithDescriptor if no archive exists on
+       * disk. This allows us to respond to app shutdown. */
+      [mtlDevice
+          newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
+                                        options:pipelineOptions
+                              completionHandler:^(id<MTLComputePipelineState> computePipelineState,
+                                                  MTLComputePipelineReflection *reflection,
+                                                  NSError *error) {
+                                pipeline = computePipelineState;
+
+                                /* Retain the pipeline so we can use it safely past the completion
+                                 * handler. */
+                                if (pipeline) {
+                                  [pipeline retain];
+                                }
+                                const char *err = error ?
+                                                      [[error localizedDescription] UTF8String] :
+                                                      nullptr;
+                                error_str = err ? err : "nil";
+
+                                compilation_finished = true;
+                              }];
+
+      /* Immediately wait for either the compilation to finish or for app shutdown. */
+      while (ShaderCache::running && !compilation_finished) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(5));
+      }
+
+      /* Add pipeline into the new archive (unless we did it earlier). */
+      if (pipeline && !linked_functions) {
+        addComputePipelineFunctionsWithDescriptor();
+      }
+    }
+
+    if (!pipeline) {
+      metal_printf(
+          "newComputePipelineStateWithDescriptor failed for \"%s\"%s. "
+          "Error:\n%s\n",
+          device_kernel_as_string((DeviceKernel)device_kernel),
+          (archive && !recreate_archive) ? " Archive may be incomplete or corrupt - attempting "
+                                           "recreation.." :
+                                           "",
+          error_str.c_str());
+    }
+  };

  double starttime = time_dt();

-  /* Block on load to ensure we continue with a valid kernel function */
-  if (creating_new_archive) {
-    starttime = time_dt();
-    NSError *error;
-    if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
-                                                      error:&error]) {
-      NSString *errStr = [error localizedDescription];
-      metal_printf("Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] : "nil");
-    }
-  }
+  do_compilation();

-  pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
-                                                      options:pipelineOptions
-                                                   reflection:nullptr
-                                                        error:&error];
-
-  bool recreate_archive = false;
+  /* An archive might have a corrupt entry and fail to materialize the pipeline. This shouldn't
+   * happen, but if it does we recreate it. */
  if (pipeline == nil && archive) {
-    NSString *errStr = [error localizedDescription];
-    metal_printf(
-        "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
-        "(error: %s)\n",
-        device_kernel_as_string((DeviceKernel)device_kernel),
-        errStr ? [errStr UTF8String] : "nil");
-    pipeline = [mtlDevice newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
-                                                        options:MTLPipelineOptionNone
-                                                     reflection:nullptr
-                                                          error:&error];
    recreate_archive = true;
+    pipelineOptions = MTLPipelineOptionNone;
+    path_remove(metalbin_path);
+
+    do_compilation();
  }

  double duration = time_dt() - starttime;

  if (pipeline == nil) {
-    NSString *errStr = [error localizedDescription];
-    error_str = string_printf("Failed to create compute pipeline state \"%s\", error: \n",
-                              device_kernel_as_string((DeviceKernel)device_kernel));
-    error_str += (errStr ? [errStr UTF8String] : "nil");
    metal_printf("%16s | %2d | %-55s | %7.2fs | FAILED!\n",
                 kernel_type_as_string(pso_type),
                 device_kernel,
@@ -789,7 +801,8 @@ void MetalKernelPipeline::compile()
      if (creating_new_archive || recreate_archive) {
        if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
                               error:&error]) {
-          metal_printf("Failed to save binary archive, error:\n%s\n",
+          metal_printf("Failed to save binary archive to %s, error:\n%s\n",
+                       metalbin_path.c_str(),
                       [[error localizedDescription] UTF8String]);
        }
        else {
@@ -857,16 +870,15 @@ void MetalDeviceKernels::wait_for_all()
  }
 }

-bool MetalDeviceKernels::any_specialization_happening_now()
+int MetalDeviceKernels::num_incomplete_specialization_requests()
 {
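-  /* Return true if any ShaderCaches have ongoing specialization requests (typically there will be
-   * only 1). */
+  /* Return the total number of incomplete specialization requests across all ShaderCaches
+   * (typically there will be only 1 cache). */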
+  int total = 0;
  for (int i = 0; i < g_shaderCacheCount; i++) {
-    if (g_shaderCache[i].second->incomplete_specialization_requests > 0) {
-      return true;
-    }
+    total += g_shaderCache[i].second->incomplete_specialization_requests;
  }
-  return false;
+  return total;
 }

 int MetalDeviceKernels::get_loaded_kernel_count(MetalDevice const *device,
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -103,7 +103,7 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
  }

  /* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
-   * one. This can be overriden with CYCLES_METAL_FORCE_INTEL. */
+   * one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
  bool has_usable_amd_gpu = false;
  if (@available(macos 12.3, *)) {
    for (id<MTLDevice> device in MTLCopyAllDevices()) {
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -1437,6 +1437,9 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)

      BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
      OptixTraversableHandle handle = blas->traversable_handle;
+      if (handle == 0) {
+        continue;
+      }

      OptixInstance &instance = instances[num_instances++];
      memset(&instance, 0, sizeof(instance));
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -359,8 +359,12 @@ void PathTraceWorkCPU::guiding_push_sample_data_to_global_storage(
 #  if PATH_GUIDING_LEVEL >= 2
  const bool use_direct_light = kernel_data.integrator.use_guiding_direct_light;
  const bool use_mis_weights = kernel_data.integrator.use_guiding_mis_weights;
+#    if OPENPGL_VERSION_MINOR >= 5
+  kg->opgl_path_segment_storage->PrepareSamples(use_mis_weights, use_direct_light, false);
+#    else
  kg->opgl_path_segment_storage->PrepareSamples(
      false, nullptr, use_mis_weights, use_direct_light, false);
+#    endif
 #  endif

 #  ifdef WITH_CYCLES_DEBUG
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -886,7 +886,7 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
 {
  /* Special trick for fast navigation: schedule multiple samples during fast navigation
   * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
-   * usable visual feedback for artists. There are a couple of tricks though. */
+   * usable visual feedback for artists. */

  if (is_denoise_active_during_update()) {
    /* When denoising is used during navigation prefer using a higher resolution with less samples
@@ -896,25 +896,12 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
    return 1;
  }

-  if (resolution_divider <= pixel_size_) {
-    /* When resolution divider is at or below pixel size, schedule one sample. This doesn't effect
-     * the sample count at this resolution division, but instead assists in the calculation of
-     * the resolution divider. */
-    return 1;
-  }
-
-  if (resolution_divider == pixel_size_ * 2) {
-    /* When resolution divider is the previous step to the final resolution, schedule two samples.
-     * This is so that rendering on lower resolution does not exceed time that it takes to render
-     * first sample at the full resolution. */
-    return 2;
-  }
-
-  /* Always render 4 samples, even if scene is configured for less.
-   * The idea here is to have enough information on the screen. Resolution divider of 2 allows us
-   * to have 4 time extra samples, so overall worst case timing is the same as the final resolution
-   * at one sample. */
-  return 4;
+  /* Schedule samples equal to the resolution divider up to a maximum of 4.
+   * The idea is to have enough information on the screen by increasing the sample count as the
+   * resolution is decreased. */
+  /* NOTE: `RenderScheduler::calculate_resolution_divider_for_time()` relies on this formula, so
+   * any change here must be reflected in its derivation. */
+  return min(max(1, resolution_divider / pixel_size_), 4);
 }

 bool RenderScheduler::work_need_adaptive_filter() const
@@ -1100,9 +1087,10 @@ void RenderScheduler::update_start_resolution_divider()
  /* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
   * render time is somewhere on a boundary between two resolutions. */

-  /* Never increase resolution to higher than the pixel size (which is possible if the scene is
-   * simple and compute device is fast). */
-  start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
+  /* Don't let resolution drop below the desired one. It's better to be slow than provide an
+   * unreadable viewport render. */
+  start_resolution_divider_ = min(resolution_divider_for_update,
+                                  default_start_resolution_divider_);

  VLOG_WORK << "Calculated resolution divider is " << start_resolution_divider_;
 }
@@ -1187,24 +1175,24 @@ void RenderScheduler::check_time_limit_reached()

 int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
 {
-  /* TODO(sergey): There should a non-iterative analytical formula here. */
+  const double ratio_between_times = actual_time / desired_time;

-  int resolution_divider = 1;
+  /* We can pass `ratio_between_times` to `get_num_samples_during_navigation()` to get our
+   * navigation samples because the equation for calculating the resolution divider is as follows:
+   * `actual_time / desired_time = sqr(resolution_divider) / sample_count`.
+   * While `resolution_divider` is less than or equal to 4, `resolution_divider = sample_count`
+   * (This relationship is determined in `get_num_samples_during_navigation()`). With some
+   * substitution we end up with `actual_time / desired_time = resolution_divider` while the
+   * resolution divider is less than or equal to 4. Once the resolution divider increases above 4,
+   * the relationship of `actual_time / desired_time = resolution_divider` is no longer true,
+   * however the sample count retrieved from `get_num_samples_during_navigation()` is still
+   * accurate if we continue using this assumption. Note that the interaction between
+   * `pixel_size`, sample count, and resolution divider is automatically accounted for, which is
+   * why `pixel_size` isn't included in any of the equations. */
+  const int navigation_samples = get_num_samples_during_navigation(
+      ceil_to_int(ratio_between_times));

-  /* This algorithm iterates through resolution dividers until a divider is found that achieves
-   * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
-   * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
-   * pre_resolution_division_samples and post_resolution_division_samples are used in this
-   * calculation to better predict the performance impact of changing resolution divisions as
-   * the sample count can also change between resolution divisions. */
-  while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
-    int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
-    resolution_divider = resolution_divider * 2;
-    int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
-    actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
-  }
-
-  return resolution_divider;
+  return ceil_to_int(sqrt(navigation_samples * ratio_between_times));
 }

 int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -57,21 +57,29 @@ void WorkTileScheduler::reset_scheduler_state()

  VLOG_WORK << "Will schedule tiles of size " << tile_size_;

-  if (VLOG_IS_ON(3)) {
-    /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile scheduling
-     * and purely focusing on the number of used path states. */
-    const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
-                                        tile_size_.num_samples;
-    const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
-    VLOG_WORK << "Number of unused path states: "
-              << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+  const int num_path_states_in_tile = tile_size_.width * tile_size_.height *
+                                      tile_size_.num_samples;
+
+  if (num_path_states_in_tile == 0) {
+    num_tiles_x_ = 0;
+    num_tiles_y_ = 0;
+    num_tiles_per_sample_range_ = 0;
+  }
+  else {
+    if (VLOG_IS_ON(3)) {
+      /* The logging is based on multiple tiles scheduled, ignoring overhead of multi-tile
+       * scheduling and purely focusing on the number of used path states. */
+      const int num_tiles = max_num_path_states_ / num_path_states_in_tile;
+      VLOG_WORK << "Number of unused path states: "
+                << max_num_path_states_ - num_tiles * num_path_states_in_tile;
+    }
+
+    num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
+    num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
+    num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);
  }

-  num_tiles_x_ = divide_up(image_size_px_.x, tile_size_.width);
-  num_tiles_y_ = divide_up(image_size_px_.y, tile_size_.height);
-
  total_tiles_num_ = num_tiles_x_ * num_tiles_y_;
-  num_tiles_per_sample_range_ = divide_up(samples_num_, tile_size_.num_samples);

  next_work_index_ = 0;
  total_work_size_ = total_tiles_num_ * num_tiles_per_sample_range_;
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -412,11 +412,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
  # warn for other versions
  if((CUDA_VERSION STREQUAL "101") OR
     (CUDA_VERSION STREQUAL "102") OR
-     (CUDA_VERSION_MAJOR STREQUAL "11"))
+     (CUDA_VERSION_MAJOR STREQUAL "11") OR
+     (CUDA_VERSION_MAJOR STREQUAL "12"))
  else()
    message(WARNING
      "CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
-      "build may succeed but only CUDA 11, 10.2 and 10.1 have been tested")
+      "build may succeed but only CUDA 12, 11, 10.2 and 10.1 have been tested")
  endif()

  # build for each arch
@@ -514,6 +515,16 @@ if(WITH_CYCLES_CUDA_BINARIES)
      else()
        message(STATUS "CUDA binaries for ${arch} require CUDA 10 or earlier, skipped.")
      endif()
+    elseif(${arch} MATCHES ".*_3.")
+      if(DEFINED CUDA11_NVCC_EXECUTABLE)
+        set(cuda_nvcc_executable ${CUDA11_NVCC_EXECUTABLE})
+        set(cuda_toolkit_root_dir ${CUDA11_TOOLKIT_ROOT_DIR})
+      elseif("${CUDA_VERSION}" LESS 120) # Support for sm_35, sm_37 was removed in CUDA 12
+        set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
+        set(cuda_toolkit_root_dir ${CUDA_TOOLKIT_ROOT_DIR})
+      else()
+        message(STATUS "CUDA binaries for ${arch} require CUDA 11 or earlier, skipped.")
+      endif()
    elseif(${arch} MATCHES ".*_7." AND "${CUDA_VERSION}" LESS 100)
      message(STATUS "CUDA binaries for ${arch} require CUDA 10.0+, skipped.")
    elseif(${arch} MATCHES ".*_8.")
@@ -732,22 +743,21 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
  endif()
  # SYCL_CPP_FLAGS is a variable that the user can set to pass extra compiler options
  set(sycl_compiler_flags
-    ${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI}
-    -fsycl
-    -fsycl-unnamed-lambda
-    -fdelayed-template-parsing
-    -mllvm -inlinedefault-threshold=250
-    -mllvm -inlinehint-threshold=350
-    -fsycl-device-code-split=per_kernel
-    -fsycl-max-parallel-link-jobs=${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS}
-    -shared
-    -DWITH_ONEAPI
-    -ffast-math
-    -DNDEBUG
-    -O2
-    -o ${cycles_kernel_oneapi_lib}
-    -I${CMAKE_CURRENT_SOURCE_DIR}/..
-    ${SYCL_CPP_FLAGS}
+      ${CMAKE_CURRENT_SOURCE_DIR}/${SRC_KERNEL_DEVICE_ONEAPI}
+      -fsycl
+      -fsycl-unnamed-lambda
+      -fdelayed-template-parsing
+      -mllvm -inlinedefault-threshold=250
+      -mllvm -inlinehint-threshold=350
+      -fsycl-device-code-split=per_kernel
+      -fsycl-max-parallel-link-jobs=${SYCL_OFFLINE_COMPILER_PARALLEL_JOBS}
+      -shared
+      -DWITH_ONEAPI
+      -ffast-math
+      -O2
+      -o"${cycles_kernel_oneapi_lib}"
+      -I"${CMAKE_CURRENT_SOURCE_DIR}/.."
+      ${SYCL_CPP_FLAGS}
  )

  if(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
@@ -774,14 +784,14 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
    list(APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
    foreach(target ${CYCLES_ONEAPI_SYCL_TARGETS})
      if(DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_${target})
-        list(APPEND sycl_compiler_flags -Xsycl-target-backend=${target} "${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}")
+        list(APPEND sycl_compiler_flags "-Xsycl-target-backend=${target} \"${CYCLES_ONEAPI_SYCL_OPTIONS_${target}}\"")
      endif()
    endforeach()
  else()
    # If AOT is disabled, build for spir64
    list(APPEND sycl_compiler_flags
      -fsycl-targets=spir64
-      -Xsycl-target-backend=spir64 "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}")
+      "-Xsycl-target-backend=spir64 \"${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}\"")
  endif()

  if(WITH_NANOVDB)
@@ -795,7 +805,6 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
  endif()

  get_filename_component(sycl_compiler_root ${SYCL_COMPILER} DIRECTORY)
-  get_filename_component(sycl_compiler_compiler_name ${SYCL_COMPILER} NAME_WE)

  if(UNIX AND NOT APPLE)
    if(NOT WITH_CXX11_ABI)
@@ -807,7 +816,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
    endif()
  endif()

-  if(WIN32)
+  if(WIN32) # Add Windows specific compiler flags.
    list(APPEND sycl_compiler_flags
    -fuse-ld=link
    -fms-extensions
@@ -834,54 +843,79 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
      get_filename_component(WINDOWS_KIT_DIR "${WINDOWS_KIT_DIR}/../" ABSOLUTE)
    endif()
    list(APPEND sycl_compiler_flags
-                -L "${MSVC_TOOLS_DIR}/lib/x64"
-                -L "${WINDOWS_KIT_DIR}/um/x64"
-                -L "${WINDOWS_KIT_DIR}/ucrt/x64")
+                -L"${MSVC_TOOLS_DIR}/lib/x64"
+                -L"${WINDOWS_KIT_DIR}/um/x64"
+                -L"${WINDOWS_KIT_DIR}/ucrt/x64")
+  else() # Add Linux specific compiler flags.
+    list(APPEND sycl_compiler_flags -fPIC)

-    set(sycl_compiler_flags_Release ${sycl_compiler_flags})
-    set(sycl_compiler_flags_Debug ${sycl_compiler_flags})
-    set(sycl_compiler_flags_RelWithDebInfo ${sycl_compiler_flags})
-    set(sycl_compiler_flags_MinSizeRel ${sycl_compiler_flags})
-    list(APPEND sycl_compiler_flags_RelWithDebInfo -g)
+    # We avoid getting __FAST_MATH__ to be defined when building on CentOS-7 and Rocky-8
+    # until the compilation issues it triggers at either AoT or JIT stages get fixed.
+    list(APPEND sycl_compiler_flags -fhonor-nans)
+
+    # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and
+    # libpi_level_zero.so can be placed next to it and get found.
+    list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN')
+  endif()
+
+  # Create CONFIG specific compiler flags.
+  set(sycl_compiler_flags_Release ${sycl_compiler_flags})
+  set(sycl_compiler_flags_Debug ${sycl_compiler_flags})
+  set(sycl_compiler_flags_RelWithDebInfo ${sycl_compiler_flags})
+
+  list(APPEND sycl_compiler_flags_Release
+              -DNDEBUG
+              )
+  list(APPEND sycl_compiler_flags_RelWithDebInfo
+              -DNDEBUG
+              -g
+              )
+  list(APPEND sycl_compiler_flags_Debug
+              -g
+              )
+
+  if(WIN32)
    list(APPEND sycl_compiler_flags_Debug
-                -g
                -D_DEBUG
-                -nostdlib -Xclang --dependent-lib=msvcrtd)
-
+                -nostdlib
+                -Xclang --dependent-lib=msvcrtd
+                )
    add_custom_command(
      OUTPUT ${cycles_kernel_oneapi_lib} ${cycles_kernel_oneapi_linker_lib}
      COMMAND ${CMAKE_COMMAND} -E env
-              "LIB=${sycl_compiler_root}/../lib" # for compiler to find sycl.lib
+              "LIB=${sycl_compiler_root}/../lib\;${sycl_compiler_root}/../compiler/lib/intel64_win" # for compiler to find sycl.lib and in case of icpx, libircmt.lib
              "PATH=${OCLOC_INSTALL_DIR}\;${sycl_compiler_root}"
              ${SYCL_COMPILER}
              "$<$<CONFIG:Release>:${sycl_compiler_flags_Release}>"
              "$<$<CONFIG:RelWithDebInfo>:${sycl_compiler_flags_RelWithDebInfo}>"
              "$<$<CONFIG:Debug>:${sycl_compiler_flags_Debug}>"
              "$<$<CONFIG:MinSizeRel>:${sycl_compiler_flags_Release}>"
-              COMMAND_EXPAND_LISTS
-              DEPENDS ${cycles_oneapi_kernel_sources})
+      COMMAND_EXPAND_LISTS
+      DEPENDS ${cycles_oneapi_kernel_sources})
  else()
-    list(APPEND sycl_compiler_flags -fPIC)
-
-    # We avoid getting __FAST_MATH__ to be defined when building on CentOS-7 until the compilation
-    # crash it triggers at either AoT or JIT stages gets fixed.
-    # TODO: check if this is still needed on Rocky-8.
-    list(APPEND sycl_compiler_flags -fhonor-nans)
-
-    # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and
-    # libpi_level_zero.so can be placed next to it and get found.
-    list(APPEND sycl_compiler_flags -Wl,-rpath,'$$ORIGIN')
-
    if(NOT IGC_INSTALL_DIR)
      get_filename_component(IGC_INSTALL_DIR "${sycl_compiler_root}/../lib/igc" ABSOLUTE)
    endif()
+    # The following join/replace operations are to prevent cmake from
+    # escaping space chars with backslashes in add_custom_command.
+    list(JOIN sycl_compiler_flags_Release " " sycl_compiler_flags_Release_str)
+    string(REPLACE " " ";" sycl_compiler_flags_Release_str ${sycl_compiler_flags_Release_str})
+    list(JOIN sycl_compiler_flags_RelWithDebInfo " " sycl_compiler_flags_RelWithDebInfo_str)
+    string(REPLACE " " ";" sycl_compiler_flags_RelWithDebInfo_str ${sycl_compiler_flags_RelWithDebInfo_str})
+    list(JOIN sycl_compiler_flags_Debug " " sycl_compiler_flags_Debug_str)
+    string(REPLACE " " ";" sycl_compiler_flags_Debug_str ${sycl_compiler_flags_Debug_str})
    add_custom_command(
      OUTPUT ${cycles_kernel_oneapi_lib}
      COMMAND ${CMAKE_COMMAND} -E env
              "LD_LIBRARY_PATH=${sycl_compiler_root}/../lib:${OCLOC_INSTALL_DIR}/lib:${IGC_INSTALL_DIR}/lib"
              # `$ENV{PATH}` is for compiler to find `ld`.
              "PATH=${OCLOC_INSTALL_DIR}/bin:${sycl_compiler_root}:$ENV{PATH}"
-              ${SYCL_COMPILER} $<$<CONFIG:Debug>:-g>$<$<CONFIG:RelWithDebInfo>:-g> ${sycl_compiler_flags}
+              ${SYCL_COMPILER}
+              "$<$<CONFIG:Release>:${sycl_compiler_flags_Release_str}>"
+              "$<$<CONFIG:RelWithDebInfo>:${sycl_compiler_flags_RelWithDebInfo_str}>"
+              "$<$<CONFIG:Debug>:${sycl_compiler_flags_Debug_str}>"
+              "$<$<CONFIG:MinSizeRel>:${sycl_compiler_flags_Release_str}>"
+      COMMAND_EXPAND_LISTS
      DEPENDS ${cycles_oneapi_kernel_sources})
  endif()

--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -686,7 +686,7 @@ ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
      albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
      break;
    case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
-      albedo *= bsdf_principled_hair_albedo(sc);
+      albedo *= bsdf_principled_hair_albedo(sd, sc);
      break;
    default:
      break;
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -478,10 +478,18 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
  return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
 }

-ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderClosure *sc)
+ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderData *sd,
+                                                ccl_private const ShaderClosure *sc)
 {
  ccl_private PrincipledHairBSDF *bsdf = (ccl_private PrincipledHairBSDF *)sc;
-  return exp(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));
+
+  const float cos_theta_o = cos_from_sin(dot(sd->wi, safe_normalize(sd->dPdu)));
+  const float cos_gamma_o = cos_from_sin(bsdf->extra->geom.w);
+  const float f = fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta);
+
+  const float roughness_scale = bsdf_principled_hair_albedo_roughness_scale(bsdf->v);
+  /* TODO(lukas): Adding the Fresnel term here as a workaround until the proper refactor. */
+  return exp(-sqrt(bsdf->sigma) * roughness_scale) + make_spectrum(f);
 }

 ccl_device_inline Spectrum
--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -20,6 +20,7 @@ KERNEL_STRUCT_BEGIN(KernelBackground, background)
 /* xyz store direction, w the angle. float4 instead of float3 is used
 * to ensure consistent padding/alignment across devices. */
 KERNEL_STRUCT_MEMBER(background, float4, sun)
+KERNEL_STRUCT_MEMBER(background, int, use_sun_guiding)
 /* Only shader index. */
 KERNEL_STRUCT_MEMBER(background, int, surface_shader)
 KERNEL_STRUCT_MEMBER(background, int, volume_shader)
@@ -39,6 +40,10 @@ KERNEL_STRUCT_MEMBER(background, int, use_mis)
 KERNEL_STRUCT_MEMBER(background, int, lightgroup)
 /* Light Index. */
 KERNEL_STRUCT_MEMBER(background, int, light_index)
+/* Padding. */
+KERNEL_STRUCT_MEMBER(background, int, pad1)
+KERNEL_STRUCT_MEMBER(background, int, pad2)
+KERNEL_STRUCT_MEMBER(background, int, pad3)
 KERNEL_STRUCT_END(KernelBackground)

 /* BVH: own BVH2 if no native device acceleration struct used. */
--- a/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_avx2.cpp
@@ -10,7 +10,7 @@
 #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 #  define KERNEL_STUB
 #else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316. */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #    define __KERNEL_SSE__
 #    define __KERNEL_SSE2__
--- a/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse2.cpp
@@ -10,7 +10,7 @@
 #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 #  define KERNEL_STUB
 #else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316. */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #    define __KERNEL_SSE2__
 #  endif
--- a/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel_sse41.cpp
@@ -10,7 +10,7 @@
 #ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 #  define KERNEL_STUB
 #else
-/* SSE optimization disabled for now on 32 bit, see bug T36316. */
+/* SSE optimization disabled for now on 32 bit, see bug #36316. */
 #  if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
 #    define __KERNEL_SSE2__
 #    define __KERNEL_SSE3__
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -645,7 +645,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
                                                          const int y,
                                                          const half4 half_pixel)
 {
-  /* Work around HIP issue with half float display, see T92972. */
+  /* Work around HIP issue with half float display, see #92972. */
 #ifdef __KERNEL_HIP__
  ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
  out[0] = half_pixel.x;
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -38,7 +38,7 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
                                                     ccl_gpu_shared int *buckets,
                                                     const ushort local_id,
                                                     const ushort local_size,
-                                                     const ushort grid_id)
+                                                     const uint grid_id)
 {
  /* Zero the bucket sizes. */
  if (local_id < max_shaders) {
@@ -89,7 +89,7 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
                                                    ccl_gpu_shared int *local_offset,
                                                    const ushort local_id,
                                                    const ushort local_size,
-                                                    const ushort grid_id)
+                                                    const uint grid_id)
 {
  /* Calculate each partition's global offset from the prefix sum of the active state counts per
   * partition. */
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -109,7 +109,7 @@ struct kernel_gpu_##name \
           const uint metal_global_id, \
           const ushort metal_local_id, \
           const ushort metal_local_size, \
-           const ushort metal_grid_id, \
+           const uint metal_grid_id, \
           uint simdgroup_size, \
           uint simd_lane_index, \
           uint simd_group_index, \
@@ -122,7 +122,7 @@ kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                                const uint metal_global_id [[thread_position_in_grid]], \
                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
                                const ushort metal_local_size [[threads_per_threadgroup]], \
-                                const ushort metal_grid_id    [[threadgroup_position_in_grid]], \
+                                const uint metal_grid_id    [[threadgroup_position_in_grid]], \
                                uint simdgroup_size [[threads_per_simdgroup]], \
                                uint simd_lane_index [[thread_index_in_simdgroup]], \
                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
@@ -135,7 +135,7 @@ void kernel_gpu_##name::run(thread MetalKernelContext& context, \
                  const uint metal_global_id, \
                  const ushort metal_local_id, \
                  const ushort metal_local_size, \
-                  const ushort metal_grid_id, \
+                  const uint metal_grid_id, \
                  uint simdgroup_size, \
                  uint simd_lane_index, \
                  uint simd_group_index, \
--- a/intern/cycles/kernel/geom/motion_triangle_shader.h
+++ b/intern/cycles/kernel/geom/motion_triangle_shader.h
@@ -89,7 +89,7 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals kg,
    float u = sd->u;
    float v = sd->v;
    float w = 1.0f - u - v;
-    sd->N = (w * normals[0] + u * normals[1] + v * normals[2]);
+    sd->N = safe_normalize(w * normals[0] + u * normals[1] + v * normals[2]);
  }
 }

--- a/intern/cycles/kernel/integrator/guiding.h
+++ b/intern/cycles/kernel/integrator/guiding.h
@@ -454,8 +454,13 @@ ccl_device_forceinline bool guiding_bsdf_init(KernelGlobals kg,
                                              ccl_private float &rand)
 {
 #if defined(__PATH_GUIDING__) && PATH_GUIDING_LEVEL >= 4
+#  if OPENPGL_VERSION_MINOR >= 5
+  if (kg->opgl_surface_sampling_distribution->Init(
+          kg->opgl_guiding_field, guiding_point3f(P), rand)) {
+#  else
  if (kg->opgl_surface_sampling_distribution->Init(
          kg->opgl_guiding_field, guiding_point3f(P), rand, true)) {
+#  endif
    kg->opgl_surface_sampling_distribution->ApplyCosineProduct(guiding_point3f(N));
    return true;
  }
@@ -506,8 +511,13 @@ ccl_device_forceinline bool guiding_phase_init(KernelGlobals kg,
    return false;
  }

+#  if OPENPGL_VERSION_MINOR >= 5
+  if (kg->opgl_volume_sampling_distribution->Init(
+          kg->opgl_guiding_field, guiding_point3f(P), rand)) {
+#  else
  if (kg->opgl_volume_sampling_distribution->Init(
          kg->opgl_guiding_field, guiding_point3f(P), rand, true)) {
+#  endif
    kg->opgl_volume_sampling_distribution->ApplySingleLobeHenyeyGreensteinProduct(guiding_vec3f(D),
                                                                                  g);
    return true;
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -149,7 +149,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
            ((ls.shader & SHADER_EXCLUDE_TRANSMIT) && (path_flag & PATH_RAY_TRANSMIT)) ||
            ((ls.shader & SHADER_EXCLUDE_CAMERA) && (path_flag & PATH_RAY_CAMERA)) ||
            ((ls.shader & SHADER_EXCLUDE_SCATTER) && (path_flag & PATH_RAY_VOLUME_SCATTER)))
-          return;
+          continue;
      }
 #endif

@@ -159,7 +159,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
         * generate a firefly for small lights since it is improbable. */
        const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);
        if (klight->use_caustics)
-          return;
+          continue;
      }
 #endif /* __MNEE__ */

@@ -169,7 +169,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
      ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
      Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, ray_time);
      if (is_zero(light_eval)) {
-        return;
+        continue;
      }

      /* MIS weighting. */
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -619,7 +619,12 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
          const Spectrum emission = volume_emission_integrate(
              &coeff, closure_flag, transmittance, dt);
          accum_emission += result.indirect_throughput * emission;
-          guiding_record_volume_emission(kg, state, emission);
+#  if OPENPGL_VERSION_MINOR < 5  // WORKAROUND #104329
+          if (kernel_data.integrator.max_volume_bounce > 1)
+#  endif
+          {
+            guiding_record_volume_emission(kg, state, emission);
+          }
        }
      }

@@ -961,9 +966,13 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
  const Spectrum phase_weight = bsdf_eval_sum(&phase_eval) / phase_pdf;

  /* Add phase function sampling data to the path segment. */
-  guiding_record_volume_bounce(
-      kg, state, sd, phase_weight, phase_pdf, normalize(phase_wo), sampled_roughness);
-
+#  if OPENPGL_VERSION_MINOR < 5  // WORKAROUND #104329
+  if (kernel_data.integrator.max_volume_bounce > 1)
+#  endif
+  {
+    guiding_record_volume_bounce(
+        kg, state, sd, phase_weight, phase_pdf, normalize(phase_wo), sampled_roughness);
+  }
  /* Update throughput. */
  const Spectrum throughput = INTEGRATOR_STATE(state, path, throughput);
  const Spectrum throughput_phase = throughput * phase_weight;
@@ -1058,7 +1067,11 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
    const float3 direct_P = ray->P + result.direct_t * ray->D;

 #  ifdef __PATH_GUIDING__
+#    if OPENPGL_VERSION_MINOR < 5  // WORKAROUND #104329
+    if (kernel_data.integrator.use_guiding && kernel_data.integrator.max_volume_bounce > 1) {
+#    else
    if (kernel_data.integrator.use_guiding) {
+#    endif
 #    if PATH_GUIDING_LEVEL >= 1
      if (result.direct_sample_method == VOLUME_SAMPLE_DISTANCE) {
        /* If the direct scatter event is generated using VOLUME_SAMPLE_DISTANCE the direct event
@@ -1130,7 +1143,12 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
 #  if defined(__PATH_GUIDING__)
 #    if PATH_GUIDING_LEVEL >= 1
    if (!guiding_generated_new_segment) {
-      guiding_record_volume_segment(kg, state, sd.P, sd.wi);
+#      if OPENPGL_VERSION_MINOR < 5  // WORKAROUND #104329
+      if (kernel_data.integrator.max_volume_bounce > 1)
+#      endif
+      {
+        guiding_record_volume_segment(kg, state, sd.P, sd.wi);
+      }
    }
 #    endif
 #    if PATH_GUIDING_LEVEL >= 4
--- a/intern/cycles/kernel/light/area.h
+++ b/intern/cycles/kernel/light/area.h
@@ -342,7 +342,7 @@ ccl_device_forceinline void area_light_update_position(const ccl_global KernelLi
  ls->D = normalize_len(ls->P - P, &ls->t);
  ls->pdf = invarea;

-  if (klight->area.tan_half_spread > 0) {
+  if (klight->area.normalize_spread > 0) {
    ls->eval_fac = 0.25f * invarea;
    ls->eval_fac *= area_light_spread_attenuation(
        ls->D, ls->Ng, klight->area.tan_half_spread, klight->area.normalize_spread);
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -113,13 +113,16 @@ ccl_device_noinline bool light_sample(KernelGlobals kg,
 {
  int prim;
  MeshLight mesh_light;
+#ifdef __LIGHT_TREE__
  if (kernel_data.integrator.use_light_tree) {
    ccl_global const KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
                                                                           emitter_index);
    prim = kemitter->prim;
    mesh_light = kemitter->mesh_light;
  }
-  else {
+  else
+#endif
+  {
    ccl_global const KernelLightDistribution *kdistribution = &kernel_data_fetch(
        light_distribution, emitter_index);
    prim = kdistribution->prim;
--- a/intern/cycles/kernel/osl/shaders/node_sky_texture.osl
+++ b/intern/cycles/kernel/osl/shaders/node_sky_texture.osl
@@ -132,11 +132,11 @@ color sky_radiance_nishita(vector dir, float nishita_data[10], string filename)
    /* definitions */
    vector sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2);
    float sun_dir_angle = precise_angle(dir, sun_dir);
-    float half_angular = angular_diameter / 2.0;
+    float half_angular = angular_diameter * 0.5;
    float dir_elevation = M_PI_2 - direction[0];

-    /* if ray inside sun disc render it, otherwise render sky.
-     * alternatively, ignore the sun if we're evaluating the background texture. */
+    /* If the ray is inside the sun disc, render it, otherwise render the sky.
+     * Alternatively, ignore the sun if we're evaluating the background texture. */
    if (sun_dir_angle < half_angular && sun_disc == 1 && raytype("importance_bake") != 1) {
      /* get 2 pixels data */
      color pixel_bottom = color(nishita_data[0], nishita_data[1], nishita_data[2]);
--- a/intern/cycles/kernel/sample/mapping.h
+++ b/intern/cycles/kernel/sample/mapping.h
@@ -84,8 +84,8 @@ ccl_device_inline void sample_uniform_cone(const float3 N,
 ccl_device_inline float pdf_uniform_cone(const float3 N, float3 D, float angle)
 {
  float zMin = cosf(angle);
-  float z = dot(N, D);
-  if (z > zMin) {
+  float z = precise_angle(N, D);
+  if (z < angle) {
    return M_1_2PI_F / (1.0f - zMin);
  }
  return 0.0f;
--- a/intern/cycles/kernel/svm/sky.h
+++ b/intern/cycles/kernel/svm/sky.h
@@ -138,12 +138,13 @@ ccl_device float3 sky_radiance_nishita(KernelGlobals kg,
    /* definitions */
    float3 sun_dir = geographical_to_direction(sun_elevation, sun_rotation + M_PI_2_F);
    float sun_dir_angle = precise_angle(dir, sun_dir);
-    float half_angular = angular_diameter / 2.0f;
+    float half_angular = angular_diameter * 0.5f;
    float dir_elevation = M_PI_2_F - direction.x;

-    /* if ray inside sun disc render it, otherwise render sky.
-     * alternatively, ignore the sun if we're evaluating the background texture. */
-    if (sun_disc && sun_dir_angle < half_angular && !(path_flag & PATH_RAY_IMPORTANCE_BAKE)) {
+    /* If the ray is inside the sun disc, render it, otherwise render the sky.
+     * Alternatively, ignore the sun if we're evaluating the background texture. */
+    if (sun_disc && sun_dir_angle < half_angular &&
+        !((path_flag & PATH_RAY_IMPORTANCE_BAKE) && kernel_data.background.use_sun_guiding)) {
      /* get 2 pixels data */
      float y;

--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -13,6 +13,7 @@
 #include "scene/light.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/osl.h"
 #include "scene/pointcloud.h"
 #include "scene/scene.h"
 #include "scene/shader.h"
@@ -25,7 +26,6 @@

 #ifdef WITH_OSL
 #  include "kernel/osl/globals.h"
-#  include "kernel/osl/services.h"
 #endif

 #include "util/foreach.h"
@@ -1717,20 +1717,7 @@ void GeometryManager::device_update_displacement_images(Device *device,
  /* If any OSL node is used for displacement, it may reference a texture. But it's
   * unknown which ones, so have to load them all. */
  if (has_osl_node) {
-    set<OSLRenderServices *> services_shared;
-    device->foreach_device([&services_shared](Device *sub_device) {
-      OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
-      services_shared.insert(og->services);
-    });
-
-    for (OSLRenderServices *services : services_shared) {
-      for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
-        if (it->second->handle.get_manager() == image_manager) {
-          const int slot = it->second->handle.svm_slot();
-          bump_images.insert(slot);
-        }
-      }
-    }
+    OSLShaderManager::osl_image_slots(device, image_manager, bump_images);
  }
 #endif

--- a/intern/cycles/scene/integrator.cpp
+++ b/intern/cycles/scene/integrator.cpp
@@ -255,8 +255,10 @@ void Integrator::device_update(Device *device, DeviceScene *dscene, Scene *scene
  kintegrator->scrambling_distance = scrambling_distance;
  kintegrator->sobol_index_mask = reverse_integer_bits(next_power_of_two(aa_samples - 1) - 1);

-  kintegrator->use_light_tree = scene->integrator->use_light_tree;
-  if (light_sampling_threshold > 0.0f) {
+  /* NOTE: The kintegrator->use_light_tree is assigned to the effective value in the light manager,
+   * and the synchronization code is expected to tag the light manager for update when the
+   * `use_light_tree` is changed. */
+  if (light_sampling_threshold > 0.0f && !kintegrator->use_light_tree) {
    kintegrator->light_inv_rr_threshold = scene->film->get_exposure() / light_sampling_threshold;
  }
  else {
--- a/intern/cycles/scene/light.cpp
+++ b/intern/cycles/scene/light.cpp
@@ -750,7 +750,7 @@ void LightManager::device_update_background(Device *device,

        /* Determine sun direction from lat/long and texture mapping. */
        float latitude = sky->get_sun_elevation();
-        float longitude = M_2PI_F - sky->get_sun_rotation() + M_PI_2_F;
+        float longitude = sky->get_sun_rotation() + M_PI_2_F;
        float3 sun_direction = make_float3(
            cosf(latitude) * cosf(longitude), cosf(latitude) * sinf(longitude), sinf(latitude));
        Transform sky_transform = transform_inverse(sky->tex_mapping.compute_transform());
@@ -772,7 +772,8 @@ void LightManager::device_update_background(Device *device,
  }

  /* If there's more than one sun, fall back to map sampling instead. */
-  if (num_suns != 1) {
+  kbackground->use_sun_guiding = (num_suns == 1);
+  if (!kbackground->use_sun_guiding) {
    kbackground->sun_weight = 0.0f;
    environment_res.x = max(environment_res.x, 4096);
    environment_res.y = max(environment_res.y, 2048);
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -394,7 +394,7 @@ bool OSLShaderManager::osl_compile(const string &inputfile, const string &output

  /* Compile.
   *
-   * Mutex protected because the OSL compiler does not appear to be thread safe, see T92503. */
+   * Mutex protected because the OSL compiler does not appear to be thread safe, see #92503. */
  static thread_mutex osl_compiler_mutex;
  thread_scoped_lock lock(osl_compiler_mutex);

@@ -665,6 +665,27 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
  return node;
 }

+/* Static function, so only this file needs to be compiled with RTTI. */
+void OSLShaderManager::osl_image_slots(Device *device,
+                                       ImageManager *image_manager,
+                                       set<int> &image_slots)
+{
+  set<OSLRenderServices *> services_shared;
+  device->foreach_device([&services_shared](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    services_shared.insert(og->services);
+  });
+
+  for (OSLRenderServices *services : services_shared) {
+    for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
+      if (it->second->handle.get_manager() == image_manager) {
+        const int slot = it->second->handle.svm_slot();
+        image_slots.insert(slot);
+      }
+    }
+  }
+}
+
 /* Graph Compiler */

 OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -92,6 +92,9 @@ class OSLShaderManager : public ShaderManager {
                           const std::string &bytecode_hash = "",
                           const std::string &bytecode = "");

+  /* Get image slots used by OSL services on device. */
+  static void osl_image_slots(Device *device, ImageManager *image_manager, set<int> &image_slots);
+
 private:
  void texture_system_init();
  void texture_system_free();
--- a/intern/cycles/scene/shader.cpp
+++ b/intern/cycles/scene/shader.cpp
@@ -153,6 +153,16 @@ static float3 output_estimate_emission(ShaderOutput *output, bool &is_constant)
      estimate *= node->get_float(strength_in->socket_type);
    }

+    /* Lower importance of emission nodes from automatic value/color to shader
+     * conversion, as these are likely used for previewing and can be slow to
+     * build a light tree for on dense meshes. */
+    if (node->type == EmissionNode::get_node_type()) {
+      EmissionNode *emission_node = static_cast<EmissionNode *>(node);
+      if (emission_node->from_auto_conversion) {
+        estimate *= 0.1f;
+      }
+    }
+
    return estimate;
  }
  else if (node->type == LightFalloffNode::get_node_type() ||
@@ -573,7 +583,7 @@ void ShaderManager::device_update_common(Device * /*device*/,
  kfilm->is_rec709 = is_rec709;
 }

-void ShaderManager::device_free_common(Device *, DeviceScene *dscene, Scene *scene)
+void ShaderManager::device_free_common(Device * /*device*/, DeviceScene *dscene, Scene * /*scene*/)
 {
  dscene->shaders.free();
 }
--- a/intern/cycles/scene/shader_graph.cpp
+++ b/intern/cycles/scene/shader_graph.cpp
@@ -260,6 +260,7 @@ void ShaderGraph::connect(ShaderOutput *from, ShaderInput *to)

    if (to->type() == SocketType::CLOSURE) {
      EmissionNode *emission = create_node<EmissionNode>();
+      emission->from_auto_conversion = true;
      emission->set_color(one_float3());
      emission->set_strength(1.0f);
      convert = add(emission);
--- a/Show More
+++ b/Show More