Compare commits: tmp-eevee-...tmp-usd-al (251 commits)

Commits (SHA1):
5418953d4c | |||
e1a29b58bb | |||
87f96b7664 | |||
a755e6e63f | |||
f72d28d7e7 | |||
db4e7616f3 | |||
f4db58844d | |||
acf7f46b77 | |||
4fbc9c42e5 | |||
5ca65001ea | |||
c9f02569c7 | |||
eb9647c53d | |||
7ba59c8a62 | |||
b3fb73f325 | |||
459c210154 | |||
715acf758c | |||
cfe828b452 | |||
4cc94679dd | |||
39f63c8c08 | ||
d4f7f1dfe4 | |||
a76b5d3a07 | |||
59b9a88a35 | |||
4126284e46 | |||
1fd0c80725 | |||
5215543280 | |||
1e449bee21 | |||
5040c39d1a | |||
a3e6cb5dab | |||
fe0a219b5b | |||
09498264f6 | |||
46c34ba1f6 | |||
d02f863b18 | |||
d851c91856 | |||
a6dcf6e6cb | |||
818c16ef1f | |||
750e278fd1 | |||
85f8ba9db7 | |||
1ac80e8338 | |||
6dc88682ea | |||
36937e523a | |||
0fa34aa0ec | |||
e928dd300b | |||
dc46465490 | |||
1d7bf4f826 | |||
526f2273c6 | ||
bb9eb262d4 | |||
1f615690e6 | |||
6a0b1eae8c | |||
cb77865c21 | |||
7dbf2e2e95 | |||
c92c52f0c9 | |||
dfacaf4f40 | |||
0dfc102531 | |||
1e794d9a92 | |||
684789c815 | |||
72a85d976a | |||
8d20db0594 | |||
64f83f80c6 | |||
9b7d71cec2 | |||
64e4aede7f | |||
675717eb79 | |||
c2a5c592fe | |||
0e1a6f6033 | |||
d42c803f5b | |||
218a909dec | |||
bea1eff3a5 | |||
d5c60fb685 | |||
9fecf1f8b8 | |||
99e71ec1f2 | |||
4f19e99cb1 | |||
147c75b831 | |||
0d9fbfe7fe | |||
da0c182a61 | |||
dd171f7743 | |||
b68bac7ced | |||
173a8f4ac9 | |||
164f591033 | |||
83a6642045 | |||
158f87203e | |||
d92c28582a | |||
d165d6aa2a | |||
5f3c97b999 | |||
afd6358fc0 | |||
701f90c677 | |||
9f4ee75c9b | |||
2e19aa1399 | |||
918cf6cd77 | |||
af8941e6a8 | |||
f828ecf4ba | |||
86b3073c9e | |||
2a7440176e | |||
6ea3fdebc8 | |||
77963ff778 | |||
f0669ff8ba | |||
3f40962414 | |||
32149f8d7a | |||
91346755ce | |||
a02fa6c40d | |||
10354b043f | |||
c7456272b1 | |||
77aa9e8809 | |||
d33960aead | |||
085c854b2a | |||
82867753cf | |||
232e02282e | |||
b9fa32cccd | |||
197eee6e04 | |||
e732580fcc | |||
158f809dcb | |||
19ea673260 | |||
b723a398f3 | |||
0f708fa2e3 | |||
6478eb565a | |||
fefc6a73b3 | |||
9f4edf8c2a | |||
ce44953933 | |||
343bb4a5a3 | |||
efabe81c91 | |||
0e6da74e98 | |||
8a32d56056 | |||
7351f533e0 | |||
5c4e1ed578 | |||
6f8c441950 | |||
d411be8a99 | |||
fae661a1ab | |||
923152d180 | |||
0ea15a6fbb | |||
284cdbb6cf | |||
bad2c3b9ef | |||
88f9c55f7f | |||
dc9f7fe64f | |||
5d30c3994e | ||
51ceeb506f | |||
01480229b1 | |||
b77c82e2bb | |||
c2c62c3618 | |||
a8d951abdd | |||
4cbe0bff34 | |||
48d9363fa7 | |||
8ac3096e24 | |||
2ee9c12a23 | |||
2d351e9ee3 | |||
7e0e07657c | |||
5c8edbd99b | |||
2cfc4d7644 | |||
bc0d3c91b1 | |||
50918d44fb | |||
1649921791 | ||
50dfd5f501 | |||
bfa7f9db0e | |||
b8e15a4a84 | |||
7ca651d182 | |||
3bed78ff59 | |||
22edf04458 | |||
666c2ea012 | |||
ca183993a5 | |||
0e196bab76 | |||
f222fe6a3a | |||
8b35db914e | |||
1883e782cb | |||
7effc6ffc4 | |||
f3d7de709f | |||
3c8f7b1a64 | |||
0381fe7bfe | |||
5f842ef336 | |||
5b110548eb | |||
9fd71d470e | |||
94d280fc3f | |||
9103978952 | |||
9c03a1c92f | |||
a0f5240089 | |||
0ab3ac7a41 | |||
a1282ab015 | |||
43f308f216 | |||
aab707ab70 | |||
4ed8a360e9 | |||
4d3bfb3f41 | |||
d781e52ee0 | |||
09eb4fe19a | |||
6aa1b5d031 | |||
5c994d7846 | |||
53b057aa09 | |||
e817cff009 | |||
41ddd3d732 | |||
f5552d759c | |||
f01bf82480 | |||
8d9d16fb53 | |||
349350b304 | |||
cb5318b651 | |||
bd6b0bac88 | |||
3002670332 | |||
f086cf3cea | |||
2609ca2b8e | |||
7e8153b07d | ||
622cad7073 | |||
44daeaae7d | |||
db8b5a2316 | |||
dbca0cc9d5 | |||
e4f77c1a6c | |||
e27c89c7c7 | ||
af5706c960 | |||
a99022e22d | |||
d5af895419 | |||
8703db393b | ||
f152159101 | ||
d3500c482f | |||
3a1583972a | |||
6dcfb6df9c | |||
b0b9e746fa | |||
8be3fcab7e | |||
d20f992322 | |||
a38d99e0b2 | |||
75db4c082b | |||
2d994de77c | ||
961d99d3a4 | |||
430cc9d7bf | |||
a3551ed878 | |||
288b13b252 | |||
85b2bce037 | |||
c7b601c79e | |||
81f8d74f6d | |||
9c14039a8f | |||
39e0bbfa55 | |||
8fe4f3b756 | |||
a86f657692 | ||
8adebaeb7c | |||
773a36d2f8 | |||
cc623ee7b0 | |||
deaddbdcff | |||
f2538c7173 | |||
4bd3b02984 | |||
7beb487e9a | |||
9ad3a85f8b | |||
654e1e901b | ||
46c9f7702a | ||
be0912a402 | ||
0a3df611e7 | ||
6d297c35c8 | ||
329eeacc66 | |||
2627635ff3 | |||
d6b6050e5b | |||
731c3efd97 | |||
9f5c17f4af | |||
4fcc9f5e7e | |||
7de1a4d1d8 | |||
d3949a4fdb | |||
b642dc7bc7 | |||
501352ef05 | |||
b2ad97ba97 | |||
b31807c75f | |||
5649158db9 |
@@ -236,6 +236,8 @@ ForEachMacros:
   - LOOP_UNSELECTED_POINTS
   - LOOP_VISIBLE_KEYS
   - LOOP_VISIBLE_POINTS
+  - LIGHT_FOREACH_BEGIN_DIRECTIONAL
+  - LIGHT_FOREACH_BEGIN_LOCAL
   - LISTBASE_CIRCULAR_BACKWARD_BEGIN
   - LISTBASE_CIRCULAR_FORWARD_BEGIN
   - LISTBASE_FOREACH
.github/pull_request_template.md (vendored, 3 changed lines)
@@ -1,5 +1,4 @@
-This repository is only used as a mirror of git.blender.org. Blender development happens on
-https://developer.blender.org.
+This repository is only used as a mirror. Blender development happens on projects.blender.org.
 
 To get started with contributing code, please see:
 https://wiki.blender.org/wiki/Process/Contributing_Code
.github/stale.yml (vendored, 3 changed lines)
@@ -15,8 +15,7 @@ staleLabel: stale
 # Comment to post when closing a stale Issue or Pull Request.
 closeComment: >
   This issue has been automatically closed, because this repository is only
-  used as a mirror of git.blender.org. Blender development happens on
-  developer.blender.org.
+  used as a mirror. Blender development happens on projects.blender.org.
 
   To get started contributing code, please read:
   https://wiki.blender.org/wiki/Process/Contributing_Code
.gitmodules (vendored, 8 changed lines)
@@ -1,20 +1,20 @@
 [submodule "release/scripts/addons"]
   path = release/scripts/addons
   url = ../blender-addons.git
-  branch = master
+  branch = main
   ignore = all
 [submodule "release/scripts/addons_contrib"]
   path = release/scripts/addons_contrib
   url = ../blender-addons-contrib.git
-  branch = master
+  branch = main
   ignore = all
 [submodule "release/datafiles/locale"]
   path = release/datafiles/locale
   url = ../blender-translations.git
-  branch = master
+  branch = main
   ignore = all
 [submodule "source/tools"]
   path = source/tools
   url = ../blender-dev-tools.git
-  branch = master
+  branch = main
   ignore = all
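After this branch switch lands, an existing checkout still tracks the old submodule branches, so a one-time re-sync is needed. A hedged sketch of the commands a developer might run (the same steps the updated helper scripts later in this diff automate):

```python
import subprocess

# One-time re-sync of an existing checkout after the submodule branch rename.
# Equivalent shell commands appear in the updated build/update scripts below.
for args in (
    ["git", "submodule", "sync"],
    ["git", "submodule", "update", "--init", "--recursive"],
    ["git", "submodule", "foreach", "git", "checkout", "main"],
    ["git", "submodule", "foreach", "git", "pull", "--rebase", "origin", "main"],
):
    subprocess.run(args, check=True)
```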
@@ -524,7 +524,7 @@ endif()
 if(NOT APPLE)
   option(WITH_CYCLES_DEVICE_HIP "Enable Cycles AMD HIP support" ON)
   option(WITH_CYCLES_HIP_BINARIES "Build Cycles AMD HIP binaries" OFF)
-  set(CYCLES_HIP_BINARIES_ARCH gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
+  set(CYCLES_HIP_BINARIES_ARCH gfx900 gfx906 gfx90c gfx902 gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 CACHE STRING "AMD HIP architectures to build binaries for")
   mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
   mark_as_advanced(CYCLES_HIP_BINARIES_ARCH)
 endif()
@@ -625,8 +625,10 @@ mark_as_advanced(
 
 # Vulkan
 option(WITH_VULKAN_BACKEND "Enable Vulkan as graphics backend (only for development)" OFF)
+option(WITH_VULKAN_GUARDEDALLOC "Use guardedalloc for host allocations done inside Vulkan (development option)" OFF)
 mark_as_advanced(
   WITH_VULKAN_BACKEND
+  WITH_VULKAN_GUARDEDALLOC
 )
 
 # Metal
@@ -299,7 +299,11 @@ else
 ifneq ("$(wildcard $(DEPS_BUILD_DIR)/build.ninja)","")
 DEPS_BUILD_COMMAND:=ninja
 else
-DEPS_BUILD_COMMAND:=make -s
+ifeq ($(OS), Darwin)
+DEPS_BUILD_COMMAND:=make -s
+else
+DEPS_BUILD_COMMAND:="$(BLENDER_DIR)/build_files/build_environment/linux/make_deps_wrapper.sh" -s
+endif
 endif
 endif
 
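The selection logic above now prefers Ninja when a `build.ninja` exists, keeps plain `make -s` on macOS, and routes Linux dependency builds through the wrapper script added later in this diff. A rough Python rendering of that decision, for illustration only (paths are placeholders):

```python
import os
import platform

def deps_build_command(deps_build_dir, blender_dir):
    """Sketch of the GNUmakefile's choice of dependency build command."""
    if os.path.exists(os.path.join(deps_build_dir, "build.ninja")):
        return ["ninja"]
    if platform.system() == "Darwin":
        return ["make", "-s"]
    # On Linux, recursive make calls are routed through the serializing wrapper.
    wrapper = os.path.join(blender_dir, "build_files", "build_environment",
                           "linux", "make_deps_wrapper.sh")
    return [wrapper, "-s"]

print(deps_build_command("/tmp/deps-build", "/tmp/blender"))
```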
@@ -398,7 +402,7 @@ endif
 
 deps: .FORCE
 	@echo
-	@echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\"
+	@echo Configuring dependencies in \"$(DEPS_BUILD_DIR)\", install to \"$(DEPS_INSTALL_DIR)\"
 
 	@cmake -H"$(DEPS_SOURCE_DIR)" \
 	      -B"$(DEPS_BUILD_DIR)" \
@@ -24,7 +24,7 @@ Development
 -----------
 
 - [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
-- [Code Review & Bug Tracker](https://developer.blender.org)
+- [Code Review & Bug Tracker](https://projects.blender.org)
 - [Developer Forum](https://devtalk.blender.org)
 - [Developer Documentation](https://wiki.blender.org)
 
@@ -22,7 +22,7 @@ elseif(UNIX AND NOT APPLE)
   )
 endif()
 
-# Boolean crashes with Arm assembly, see T103423.
+# Boolean crashes with Arm assembly, see #103423.
 if(BLENDER_PLATFORM_ARM)
   set(GMP_OPTIONS
     ${GMP_OPTIONS}
@@ -40,7 +40,8 @@ ExternalProject_Add(external_igc_llvm
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
|
||||
)
|
||||
add_dependencies(
|
||||
external_igc_llvm
|
||||
@@ -55,9 +56,6 @@ ExternalProject_Add(external_igc_spirv_translator
|
||||
CONFIGURE_COMMAND echo .
|
||||
BUILD_COMMAND echo .
|
||||
INSTALL_COMMAND echo .
|
||||
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
|
||||
${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
|
||||
)
|
||||
add_dependencies(
|
||||
external_igc_spirv_translator
|
||||
|
@@ -88,6 +88,19 @@ else()
     export LDFLAGS=${PYTHON_LDFLAGS} &&
     export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)
 
+  # NOTE: untested on APPLE so far.
+  if(NOT APPLE)
+    set(PYTHON_CONFIGURE_EXTRA_ARGS
+      ${PYTHON_CONFIGURE_EXTRA_ARGS}
+      # Used on most release Linux builds (Fedora for e.g.),
+      # increases build times noticeably with the benefit of a modest speedup at runtime.
+      --enable-optimizations
+      # While LTO is OK when building on the same system, it's incompatible across GCC versions,
+      # making it impractical for developers to build against, so keep it disabled.
+      # `--with-lto`
+    )
+  endif()
+
 ExternalProject_Add(external_python
   URL file://${PACKAGE_DIR}/${PYTHON_FILE}
   DOWNLOAD_DIR ${DOWNLOAD_DIR}
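The comments above spell out the trade-off: `--enable-optimizations` (profile-guided optimization) is worth the longer build, while `--with-lto` stays disabled because objects built with one GCC version are not reusable with another. A hedged Python sketch of assembling such configure flags (helper name and defaults are illustrative, not part of the build system):

```python
import platform

def python_configure_args(base_args):
    """Illustrative only: assemble CPython configure flags mirroring the logic above."""
    args = list(base_args)
    if platform.system() != "Darwin":  # the CMake branch is if(NOT APPLE)
        # Profile-guided optimization: noticeably slower build, modest runtime speedup.
        args.append("--enable-optimizations")
        # LTO stays off: objects are not reusable across GCC versions,
        # which makes developer builds against the libraries impractical.
        # args.append("--with-lto")
    return args

print(python_configure_args(["--prefix=/opt/python"]))
```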
@@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
|
||||
# compiler, the versions used are taken from the following location
|
||||
# https://github.com/intel/intel-graphics-compiler/releases
|
||||
|
||||
set(IGC_VERSION 1.0.12149.1)
|
||||
set(IGC_VERSION 1.0.13064.7)
|
||||
set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
|
||||
set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
|
||||
set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
|
||||
set(IGC_HASH_TYPE SHA256)
|
||||
set(IGC_FILE igc-${IGC_VERSION}.tar.gz)
|
||||
|
||||
@@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
|
||||
#
|
||||
# WARNING WARNING WARNING
|
||||
|
||||
set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
|
||||
set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
|
||||
set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
|
||||
set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
|
||||
set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
|
||||
set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
|
||||
set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_VCINTRINSICS_VERSION v0.5.0)
|
||||
set(IGC_VCINTRINSICS_VERSION v0.11.0)
|
||||
set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
|
||||
set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
|
||||
set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
|
||||
set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
|
||||
set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)
|
||||
|
||||
@@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
|
||||
set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
|
||||
set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)
|
||||
|
||||
set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
|
||||
set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
|
||||
set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
|
||||
set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
|
||||
set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
|
||||
set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
|
||||
set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
|
||||
|
||||
@@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
|
||||
### Intel Graphics Compiler DEPS END ###
|
||||
########################################
|
||||
|
||||
set(GMMLIB_VERSION intel-gmmlib-22.1.8)
|
||||
set(GMMLIB_VERSION intel-gmmlib-22.3.0)
|
||||
set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
|
||||
set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
|
||||
set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
|
||||
set(GMMLIB_HASH_TYPE SHA256)
|
||||
set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)
|
||||
|
||||
set(OCLOC_VERSION 22.38.24278)
|
||||
set(OCLOC_VERSION 22.49.25018.21)
|
||||
set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
|
||||
set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
|
||||
set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
|
||||
set(OCLOC_HASH_TYPE SHA256)
|
||||
set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)
|
||||
|
||||
|
build_files/build_environment/linux/make_deps_wrapper.sh (new executable file, 73 lines)
@@ -0,0 +1,73 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: GPL-2.0-or-later

# This script ensures:
# - One dependency is built at a time.
# - That dependency uses all available cores.
#
# Without this, simply calling `make -j$(nproc)` from the `${CMAKE_BUILD_DIR}/deps/`
# directory will build all projects at once.
#
# This is undesirable for the following reasons:
#
# - The output from many projects is mixed together,
#   making it difficult to track down the cause of a build failure.
#
# - Larger dependencies such as LLVM can bottleneck the build process,
#   making it necessary to cancel the build and manually run build commands in each directory.
#
# - Building many projects at once means canceling (Control-C) can lead to the build being in an undefined state.
#   It's possible canceling happens as a patch is being applied or files are being copied.
#   (steps that aren't part of the compilation process where it's typically safe to cancel).

if [[ -z "${MY_MAKE_CALL_LEVEL}" ]]; then
  export MY_MAKE_CALL_LEVEL=0
  export MY_MAKEFLAGS=$MAKEFLAGS

  # Extract the jobs argument (`-jN`, `-j N`, `--jobs=N`).
  add_next=0
  for i in "$@"; do
    case $i in
      -j*)
        export MY_JOBS_ARG=$i
        if [ "$MY_JOBS_ARG" = "-j" ]; then
          add_next=1
        fi
        ;;
      --jobs=*)
        shift # past argument=value
        export MY_JOBS_ARG=$i
        ;;
      *)
        if (( $add_next == 1 )); then
          export MY_JOBS_ARG="$MY_JOBS_ARG $i"
          add_next=0
        fi
        ;;
    esac
  done
  unset i add_next

  if [[ -z "${MY_JOBS_ARG}" ]]; then
    export MY_JOBS_ARG="-j$(nproc)"
  fi
  # Support user defined `MAKEFLAGS`.
  export MAKEFLAGS="$MY_MAKEFLAGS -j1"
else
  export MY_MAKE_CALL_LEVEL=$(( $MY_MAKE_CALL_LEVEL + 1 ))
  if (( $MY_MAKE_CALL_LEVEL == 1 )); then
    # Important to set jobs to 1, otherwise user defined jobs argument is used.
    export MAKEFLAGS="$MY_MAKEFLAGS -j1"
  elif (( $MY_MAKE_CALL_LEVEL == 2 )); then
    # This is the level used by each sub-project.
    export MAKEFLAGS="$MY_MAKEFLAGS $MY_JOBS_ARG"
  fi
  # Else leave `MY_MAKEFLAGS` flags as-is, avoids setting a high number of jobs on recursive
  # calls (which may easily run out of memory). Let the job-server handle the rest.
fi

# Useful for troubleshooting the wrapper.
# echo "Call level: $MY_MAKE_CALL_LEVEL, args=$@".

# Call actual make but ensure recursive calls run via this script.
exec make MAKE="$0" "$@"
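The fiddliest part of the wrapper is recovering the user's jobs request, which `make` accepts as `-jN`, `-j N`, or `--jobs=N`. A standalone Python sketch of that extraction (illustrative only, not part of the repository):

```python
import os

def extract_jobs_arg(argv):
    """Return the jobs flag from a make-style command line ('-jN', '-j N' or '--jobs=N')."""
    jobs = None
    expect_value = False
    for arg in argv:
        if expect_value:
            jobs = f"{jobs} {arg}"   # '-j N' form: the value follows as its own argument
            expect_value = False
        elif arg == "-j":
            jobs = "-j"
            expect_value = True
        elif arg.startswith("-j") or arg.startswith("--jobs="):
            jobs = arg
    # Fall back to one job per core, as the wrapper does with $(nproc).
    return jobs or f"-j{os.cpu_count()}"

print(extract_jobs_arg(["deps", "-j", "8"]))  # -> "-j 8"
```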
@@ -1,7 +1,7 @@
|
||||
diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
|
||||
--- external_igc_opencl_clang.orig/CMakeLists.txt 2022-03-16 05:51:10 -0600
|
||||
+++ external_igc_opencl_clang/CMakeLists.txt 2022-05-23 10:40:09 -0600
|
||||
@@ -126,22 +126,24 @@
|
||||
@@ -147,22 +147,24 @@
|
||||
)
|
||||
endif()
|
||||
|
||||
|
@@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)
 
   if(MY_WC_BRANCH STREQUAL "HEAD")
     # Detached HEAD, check whether commit hash is reachable
-    # in the master branch
+    # in the main branch
     execute_process(COMMAND git rev-parse --short=12 HEAD
                     WORKING_DIRECTORY ${SOURCE_DIR}
                     OUTPUT_VARIABLE MY_WC_HASH
                     OUTPUT_STRIP_TRAILING_WHITESPACE)
 
-    execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
+    execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
                     WORKING_DIRECTORY ${SOURCE_DIR}
                     OUTPUT_VARIABLE _git_contains_check
                     OUTPUT_STRIP_TRAILING_WHITESPACE)
 
     if(NOT _git_contains_check STREQUAL "")
-      set(MY_WC_BRANCH "master")
+      set(MY_WC_BRANCH "main")
     else()
       execute_process(COMMAND git show-ref --tags -d
                       WORKING_DIRECTORY ${SOURCE_DIR}
@@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
                       OUTPUT_STRIP_TRAILING_WHITESPACE)
 
       if(_git_tag_hashes MATCHES "${_git_head_hash}")
-        set(MY_WC_BRANCH "master")
+        set(MY_WC_BRANCH "main")
       else()
         execute_process(COMMAND git branch --contains ${MY_WC_HASH}
                         WORKING_DIRECTORY ${SOURCE_DIR}
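The branch detection above can be exercised outside CMake. A hedged Python sketch of the same check, reporting whether a detached HEAD is reachable from `main` or a `blender-v*` release branch:

```python
import subprocess

def branch_for_detached_head(repo_dir="."):
    """Roughly mirror the CMake logic: report 'main' if HEAD is reachable from
    main or a blender-v* branch, otherwise the first branch that contains it."""
    head = subprocess.check_output(
        ["git", "rev-parse", "--short=12", "HEAD"], cwd=repo_dir, text=True).strip()
    contains = subprocess.check_output(
        ["git", "branch", "--list", "main", "blender-v*", "--contains", head],
        cwd=repo_dir, text=True).strip()
    if contains:
        return "main"
    any_branch = subprocess.check_output(
        ["git", "branch", "--contains", head], cwd=repo_dir, text=True).strip()
    return any_branch.splitlines()[0].lstrip("* ").strip() if any_branch else "unknown"

print(branch_for_detached_head())
```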
@@ -11,11 +11,11 @@
 mkdir ~/blender-git
 cd ~/blender-git
 
-git clone http://git.blender.org/blender.git
+git clone https://projects.blender.org/blender/blender.git
 cd blender
 git submodule update --init --recursive
-git submodule foreach git checkout master
-git submodule foreach git pull --rebase origin master
+git submodule foreach git checkout main
+git submodule foreach git pull --rebase origin main
 
 # create build dir
 mkdir ~/blender-git/build-cmake
@@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
 echo ""
 echo "* Useful Commands *"
 echo " Run Blender: ~/blender-git/blender/blender.bin"
-echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
+echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
 echo " Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
 echo " Build Blender: cd ~/blender-git/build-cmake ; make"
 echo ""
@@ -544,7 +544,7 @@ endfunction()
 function(setup_platform_linker_libs
   target
   )
-  # jemalloc must be early in the list, to be before pthread (see T57998)
+  # jemalloc must be early in the list, to be before pthread (see #57998).
   if(WITH_MEM_JEMALLOC)
     target_link_libraries(${target} ${JEMALLOC_LIBRARIES})
   endif()
@@ -440,7 +440,7 @@ string(APPEND PLATFORM_LINKFLAGS " -stdlib=libc++")
 # Make stack size more similar to Embree, required for Embree.
 string(APPEND PLATFORM_LINKFLAGS_EXECUTABLE " -Wl,-stack_size,0x100000")
 
-# Suppress ranlib "has no symbols" warnings (workaround for T48250)
+# Suppress ranlib "has no symbols" warnings (workaround for #48250).
 set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
 set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
 # llvm-ranlib doesn't support this flag. Xcode's libtool does.
@@ -121,7 +121,7 @@ if(WITH_WINDOWS_BUNDLE_CRT)
   include(InstallRequiredSystemLibraries)
 
   # ucrtbase(d).dll cannot be in the manifest, due to the way windows 10 handles
-  # redirects for this dll, for details see T88813.
+  # redirects for this dll, for details see #88813.
   foreach(lib ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS})
     string(FIND ${lib} "ucrtbase" pos)
     if(NOT pos EQUAL -1)
@@ -295,7 +295,7 @@ unset(MATERIALX_LIB_FOLDER_EXISTS)
 if(NOT MSVC_CLANG AND # Available with MSVC 15.7+ but not for CLANG.
    NOT WITH_WINDOWS_SCCACHE AND # And not when sccache is enabled
    NOT VS_CLANG_TIDY) # Clang-tidy does not like these options
-  add_compile_options(/experimental:external /external:templates- /external:I "${LIBDIR}" /external:W0)
+  add_compile_options(/experimental:external /external:I "${LIBDIR}" /external:W0)
 endif()
 
 # Add each of our libraries to our cmake_prefix_path so find_package() could work
@@ -5,16 +5,16 @@
 update-code:
   git:
     submodules:
-      - branch: master
+      - branch: main
        commit_id: HEAD
        path: release/scripts/addons
-      - branch: master
+      - branch: main
        commit_id: HEAD
        path: release/scripts/addons_contrib
-      - branch: master
+      - branch: main
        commit_id: HEAD
        path: release/datafiles/locale
-      - branch: master
+      - branch: main
        commit_id: HEAD
        path: source/tools
   svn:
@@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
 ## Source Code
 
 * [Releases](https://download.blender.org/source/)
-* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
+* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)
 
 ## Credits
 
@@ -170,7 +170,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
         return "rebase or merge in progress, complete it first"
 
     # Abort if uncommitted changes.
-    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
+    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
     if len(changes) != 0:
         return "you have unstaged changes"
 
@@ -202,8 +202,8 @@ def submodules_update(
         sys.exit(1)
 
     # Update submodules to appropriate given branch,
-    # falling back to master if none is given and/or found in a sub-repository.
-    branch_fallback = "master"
+    # falling back to main if none is given and/or found in a sub-repository.
+    branch_fallback = "main"
     if not branch:
         branch = branch_fallback
 
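The two changes above are easy to reproduce in isolation: `--ignore-submodules` keeps dirty submodules from blocking an update of the main repository, and the fallback branch is now `main`. A small self-contained sketch of the same checks (hypothetical helper, not the repository's own code):

```python
import subprocess

def update_skip_reason(git_command="git", repo="."):
    """Return a reason to skip updating, or None if the checkout is clean."""
    changes = subprocess.check_output(
        [git_command, "status", "--porcelain", "--untracked-files=no", "--ignore-submodules"],
        cwd=repo)
    if changes:
        return "you have unstaged changes"
    return None

def resolve_submodule_branch(branch=None):
    """Fall back to 'main' when no branch is given."""
    return branch or "main"

print(update_skip_reason(), resolve_submodule_branch())
```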
@@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
   if not "%GIT%" == "" (
     "%GIT%" submodule update --init --recursive --progress
     if errorlevel 1 goto FAIL
-    "%GIT%" submodule foreach git checkout master
+    "%GIT%" submodule foreach git checkout main
     if errorlevel 1 goto FAIL
-    "%GIT%" submodule foreach git pull --rebase origin master
+    "%GIT%" submodule foreach git pull --rebase origin main
     if errorlevel 1 goto FAIL
     goto EOF
   ) else (
@@ -37,7 +37,7 @@ def draw_callback_px(self, context):
     # BLF drawing routine
     font_id = font_info["font_id"]
     blf.position(font_id, 2, 80, 0)
-    blf.size(font_id, 50, 72)
+    blf.size(font_id, 50)
     blf.draw(font_id, "Hello World")
 
 
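`blf.size()` drops its DPI argument here, so text size is given in points only. A minimal draw callback using the updated call (a sketch for Blender's Python environment, assuming a font already loaded into `font_id`):

```python
import blf

def draw_hello(font_id):
    """Draw a label with the updated blf.size() signature (no DPI argument)."""
    blf.position(font_id, 2, 80, 0)
    blf.size(font_id, 50)          # size in points; DPI is no longer passed
    blf.draw(font_id, "Hello World")
```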
@@ -476,7 +476,7 @@ MODULE_GROUPING = {
|
||||
|
||||
# -------------------------------BLENDER----------------------------------------
|
||||
|
||||
# converting bytes to strings, due to T30154
|
||||
# Converting bytes to strings, due to #30154.
|
||||
BLENDER_REVISION = str(bpy.app.build_hash, 'utf_8')
|
||||
BLENDER_REVISION_TIMESTAMP = bpy.app.build_commit_timestamp
|
||||
|
||||
@@ -487,7 +487,7 @@ BLENDER_VERSION_DOTS = "%d.%d" % (bpy.app.version[0], bpy.app.version[1])
|
||||
if BLENDER_REVISION != "Unknown":
|
||||
# SHA1 Git hash
|
||||
BLENDER_VERSION_HASH = BLENDER_REVISION
|
||||
BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://developer.blender.org/rB%s>%s</a>" % (
|
||||
BLENDER_VERSION_HASH_HTML_LINK = "<a href=https://projects.blender.org/blender/blender/commit/%s>%s</a>" % (
|
||||
BLENDER_VERSION_HASH, BLENDER_VERSION_HASH,
|
||||
)
|
||||
BLENDER_VERSION_DATE = time.strftime("%d/%m/%Y", time.localtime(BLENDER_REVISION_TIMESTAMP))
|
||||
@@ -647,7 +647,7 @@ def undocumented_message(module_name, type_name, identifier):
|
||||
module_name, type_name, identifier,
|
||||
)
|
||||
|
||||
return "Undocumented, consider `contributing <https://developer.blender.org/T51061>`__."
|
||||
return "Undocumented, consider `contributing <https://developer.blender.org/>`__."
|
||||
|
||||
|
||||
def range_str(val):
|
||||
@@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):
|
||||
|
||||
# operators
|
||||
def write_ops():
|
||||
API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
|
||||
API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
|
||||
API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
|
||||
API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/release/scripts"
|
||||
API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
|
||||
API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"
|
||||
|
||||
op_modules = {}
|
||||
op = None
|
||||
@@ -2200,7 +2200,7 @@ def write_rst_enum_items(basepath, key, key_no_prefix, enum_items):
|
||||
Write a single page for a static enum in RST.
|
||||
|
||||
This helps avoiding very large lists being in-lined in many places which is an issue
|
||||
especially with icons in ``bpy.types.UILayout``. See T87008.
|
||||
especially with icons in ``bpy.types.UILayout``. See #87008.
|
||||
"""
|
||||
filepath = os.path.join(basepath, "%s.rst" % key_no_prefix)
|
||||
with open(filepath, "w", encoding="utf-8") as fh:
|
||||
|
@@ -156,7 +156,7 @@ var Popover = function() {
     },
     getNamed : function(v) {
       $.each(all_versions, function(ix, title) {
-        if (ix === "master" || ix === "latest") {
+        if (ix === "master" || ix === "main" || ix === "latest") {
          var m = title.match(/\d\.\d[\w\d\.]*/)[0];
          if (parseFloat(m) == v) {
            v = ix;
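The switcher now treats `main` like the old `master` and `latest` entries: it parses the numeric version out of the page title and compares it to the requested version. A hedged Python equivalent of that check (title string is illustrative):

```python
import re

def matches_named_version(index_name, title, v):
    """Rough Python equivalent of the JS check: for 'master', 'main' or 'latest'
    entries, compare the numeric version parsed from the page title."""
    if index_name in ("master", "main", "latest"):
        m = re.search(r"\d\.\d[\w\d.]*", title)
        if m and float(m.group(0)[:3]) == v:  # parseFloat-style: keep leading major.minor
            return True
    return False

print(matches_named_version("main", "Blender 3.6 Manual", 3.6))  # True
```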
extern/hipew/README.blender (vendored, 2 changed lines)
@@ -1,5 +1,5 @@
 Project: Blender
-URL: https://git.blender.org/blender.git
+URL: https://projects.blender.org/blender/blender.git
 License: Apache 2.0
 Upstream version: N/A
 Local modifications: None
@@ -12,6 +12,7 @@ from bpy.props import (
     PointerProperty,
     StringProperty,
 )
+from bpy.app.translations import pgettext_iface as iface_
 
 from math import pi
 
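The newly imported `iface_` (pgettext_iface) lets the add-on translate a message once and then substitute values into it, passing `translate=False` so the already-translated string is not translated a second time. The pattern used by the labels further down in this file, shown as a sketch (the `col` layout and driver version are placeholders):

```python
from bpy.app.translations import pgettext_iface as iface_

def draw_requirement(col, driver_version="470"):
    # Translate the format string first, then substitute the untranslated value.
    col.label(text=iface_("and NVIDIA driver version %s or newer") % driver_version,
              icon='BLANK1', translate=False)
```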
@@ -1664,30 +1665,48 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
||||
col.label(text="No compatible GPUs found for Cycles", icon='INFO')
|
||||
|
||||
if device_type == 'CUDA':
|
||||
col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
|
||||
compute_capability = "3.0"
|
||||
col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
|
||||
icon='BLANK1', translate=False)
|
||||
elif device_type == 'OPTIX':
|
||||
col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
|
||||
col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
|
||||
compute_capability = "5.0"
|
||||
driver_version = "470"
|
||||
col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
|
||||
icon='BLANK1', translate=False)
|
||||
col.label(text="and NVIDIA driver version %s or newer" % driver_version,
|
||||
icon='BLANK1', translate=False)
|
||||
elif device_type == 'HIP':
|
||||
import sys
|
||||
if sys.platform[:3] == "win":
|
||||
col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
|
||||
col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
|
||||
driver_version = "21.Q4"
|
||||
col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
|
||||
col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
|
||||
icon='BLANK1', translate=False)
|
||||
elif sys.platform.startswith("linux"):
|
||||
col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
|
||||
col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
|
||||
driver_version = "22.10"
|
||||
col.label(text="Requires AMD GPU with Vega or RDNA architecture", icon='BLANK1')
|
||||
col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
|
||||
translate=False)
|
||||
elif device_type == 'ONEAPI':
|
||||
import sys
|
||||
if sys.platform.startswith("win"):
|
||||
driver_version = "101.4032"
|
||||
col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
|
||||
col.label(text="and Windows driver version 101.4032 or newer", icon='BLANK1')
|
||||
col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
|
||||
icon='BLANK1', translate=False)
|
||||
elif sys.platform.startswith("linux"):
|
||||
driver_version = "1.3.24931"
|
||||
col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
|
||||
col.label(text=" - intel-level-zero-gpu version 1.3.24931 or newer", icon='BLANK1')
|
||||
col.label(text=iface_(" - intel-level-zero-gpu version %s or newer") % driver_version,
|
||||
icon='BLANK1', translate=False)
|
||||
col.label(text=" - oneAPI Level-Zero Loader", icon='BLANK1')
|
||||
elif device_type == 'METAL':
|
||||
col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
|
||||
col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
|
||||
silicon_mac_version = "12.2"
|
||||
amd_mac_version = "12.3"
|
||||
col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
|
||||
icon='BLANK1', translate=False)
|
||||
col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
|
||||
translate=False)
|
||||
return
|
||||
|
||||
for device in devices:
|
||||
@@ -1723,12 +1742,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
 
         if compute_device_type == 'METAL':
             import platform
-            # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
-            # Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
-            if platform.machine() == 'arm64':
+            import re
+            is_navi_2 = False
+            for device in devices:
+                if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
+                    is_navi_2 = True
+                    break
+
+            # MetalRT only works on Apple Silicon and Navi2.
+            is_arm64 = platform.machine() == 'arm64'
+            if is_arm64 or is_navi_2:
                 col = layout.column()
                 col.use_property_split = True
-                col.prop(self, "kernel_optimization_level")
+                # Kernel specialization is only supported on Apple Silicon
+                if is_arm64:
+                    col.prop(self, "kernel_optimization_level")
                 col.prop(self, "use_metalrt")
 
     def draw(self, context):
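The Navi 2 detection above matches AMD marketing names in the reported Metal device list, so the MetalRT option can be shown on those GPUs as well as on Apple Silicon. The check in isolation, using the same regular expression (device names are illustrative):

```python
import re

NAVI2_PATTERN = re.compile(r"((RX)|(Pro)|(PRO))\s+W?6\d00X")

def has_navi_2(device_names):
    """Return True if any reported Metal device looks like an AMD Navi 2 GPU."""
    return any(NAVI2_PATTERN.search(name) for name in device_names)

print(has_navi_2(["Apple M2 Max"]))               # False
print(has_navi_2(["AMD Radeon Pro W6800X Duo"]))  # True
```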
@@ -20,7 +20,7 @@ class CyclesPresetPanel(PresetPanel, Panel):
     @staticmethod
     def post_cb(context):
         # Modify an arbitrary built-in scene property to force a depsgraph
-        # update, because add-on properties don't. (see T62325)
+        # update, because add-on properties don't. (see #62325)
         render = context.scene.render
         render.filter_size = render.filter_size
 
@@ -105,11 +105,12 @@ GPUShader *BlenderFallbackDisplayShader::bind(int width, int height)
 
   /* Bind shader now to enable uniform assignment. */
   GPU_shader_bind(shader_program_);
-  GPU_shader_uniform_int(shader_program_, image_texture_location_, 0);
+  int slot = 0;
+  GPU_shader_uniform_int_ex(shader_program_, image_texture_location_, 1, 1, &slot);
   float size[2];
   size[0] = width;
   size[1] = height;
-  GPU_shader_uniform_vector(shader_program_, fullscreen_location_, 2, 1, size);
+  GPU_shader_uniform_float_ex(shader_program_, fullscreen_location_, 2, 1, size);
   return shader_program_;
 }
 
@@ -20,7 +20,7 @@ BlenderImageLoader::BlenderImageLoader(BL::Image b_image,
     : b_image(b_image),
       frame(frame),
       tile_number(tile_number),
-      /* Don't free cache for preview render to avoid race condition from T93560, to be fixed
+      /* Don't free cache for preview render to avoid race condition from #93560, to be fixed
        * properly later as we are close to release. */
       free_cache(!is_preview_render && !b_image.has_data())
 {
@@ -72,7 +72,7 @@ bool BlenderImageLoader::load_metadata(const ImageDeviceFeatures &, ImageMetaDat
     metadata.colorspace = u_colorspace_raw;
   }
   else {
-    /* In some cases (e.g. T94135), the colorspace setting in Blender gets updated as part of the
+    /* In some cases (e.g. #94135), the colorspace setting in Blender gets updated as part of the
      * metadata queries in this function, so update the colorspace setting here. */
     PointerRNA colorspace_ptr = b_image.colorspace_settings().ptr;
     metadata.colorspace = get_enum_identifier(colorspace_ptr, "name");
@@ -24,7 +24,7 @@ void BlenderSync::sync_light(BL::Object &b_parent,
   Light *light = light_map.find(key);
 
   /* Check if the transform was modified, in case a linked collection is moved we do not get a
-   * specific depsgraph update (T88515). This also mimics the behavior for Objects. */
+   * specific depsgraph update (#88515). This also mimics the behavior for Objects. */
   const bool tfm_updated = (light && light->get_tfm() != tfm);
 
   /* Update if either object or light data changed. */
@@ -404,7 +404,7 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
    * point we know that we've got everything to render current view layer.
    */
   /* At the moment we only free if we are not doing multi-view
-   * (or if we are rendering the last view). See T58142/D4239 for discussion.
+   * (or if we are rendering the last view). See #58142/D4239 for discussion.
    */
   if (view_index == num_views - 1) {
     free_blender_memory_if_possible();
@@ -766,7 +766,7 @@ void BlenderSync::free_data_after_sync(BL::Depsgraph &b_depsgraph)
       (BlenderSession::headless || is_interface_locked) &&
       /* Baking re-uses the depsgraph multiple times, clearing crashes
        * reading un-evaluated mesh data which isn't aligned with the
-       * geometry we're baking, see T71012. */
+       * geometry we're baking, see #71012. */
       !scene->bake_manager->get_baking() &&
       /* Persistent data must main caches for performance and correctness. */
       !is_persistent_data;
@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
|
||||
}
|
||||
|
||||
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
|
||||
: GPUDevice(info, stats, profiler)
|
||||
{
|
||||
/* Verify that base class types can be used with specific backend types */
|
||||
static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
|
||||
static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
|
||||
|
||||
first_error = true;
|
||||
|
||||
cuDevId = info.num;
|
||||
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
|
||||
need_texture_info = false;
|
||||
|
||||
device_texture_headroom = 0;
|
||||
device_working_headroom = 0;
|
||||
move_texture_to_host = false;
|
||||
map_host_limit = 0;
|
||||
map_host_used = 0;
|
||||
can_map_host = 0;
|
||||
pitch_alignment = 0;
|
||||
|
||||
/* Initialize CUDA. */
|
||||
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
|
||||
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
|
||||
* so we can predict which memory to map to host. */
|
||||
cuda_assert(
|
||||
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
|
||||
int value;
|
||||
cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
|
||||
can_map_host = value != 0;
|
||||
|
||||
cuda_assert(cuDeviceGetAttribute(
|
||||
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
|
||||
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
|
||||
# endif
|
||||
}
|
||||
|
||||
void CUDADevice::init_host_memory()
|
||||
{
|
||||
/* Limit amount of host mapped memory, because allocating too much can
|
||||
* cause system instability. Leave at least half or 4 GB of system
|
||||
* memory free, whichever is smaller. */
|
||||
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
|
||||
size_t system_ram = system_physical_ram();
|
||||
|
||||
if (system_ram > 0) {
|
||||
if (system_ram / 2 > default_limit) {
|
||||
map_host_limit = system_ram - default_limit;
|
||||
}
|
||||
else {
|
||||
map_host_limit = system_ram / 2;
|
||||
}
|
||||
}
|
||||
else {
|
||||
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
|
||||
map_host_limit = 0;
|
||||
}
|
||||
|
||||
/* Amount of device memory to keep is free after texture memory
|
||||
* and working memory allocations respectively. We set the working
|
||||
* memory limit headroom lower so that some space is left after all
|
||||
* texture memory allocations. */
|
||||
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
|
||||
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
|
||||
|
||||
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
|
||||
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
|
||||
}
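The host-mapping limit described in the comments above works out as: keep free the smaller of half the system RAM and 4 GiB, so the limit is `RAM - 4 GiB` on large machines and `RAM / 2` on small ones. A small worked example of that rule (illustrative values, not code from the renderer):

```python
GiB = 1024 ** 3

def mapped_host_limit(system_ram_bytes):
    """Mirror the headroom rule: leave min(RAM / 2, 4 GiB) free for the OS."""
    default_limit = 4 * GiB
    if system_ram_bytes == 0:
        return 0  # unknown RAM: disable mapped host memory
    if system_ram_bytes // 2 > default_limit:
        return system_ram_bytes - default_limit
    return system_ram_bytes // 2

print(mapped_host_limit(32 * GiB) // GiB)  # 28 GiB available for host mapping
print(mapped_host_limit(6 * GiB) // GiB)   # 3 GiB
```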
|
||||
|
||||
void CUDADevice::load_texture_info()
|
||||
{
|
||||
if (need_texture_info) {
|
||||
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
|
||||
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
|
||||
need_texture_info = false;
|
||||
texture_info.copy_to_device();
|
||||
}
|
||||
}
|
||||
|
||||
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
|
||||
{
|
||||
/* Break out of recursive call, which can happen when moving memory on a multi device. */
|
||||
static bool any_device_moving_textures_to_host = false;
|
||||
if (any_device_moving_textures_to_host) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Signal to reallocate textures in host memory only. */
|
||||
move_texture_to_host = true;
|
||||
|
||||
while (size > 0) {
|
||||
/* Find suitable memory allocation to move. */
|
||||
device_memory *max_mem = NULL;
|
||||
size_t max_size = 0;
|
||||
bool max_is_image = false;
|
||||
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
|
||||
device_memory &mem = *pair.first;
|
||||
CUDAMem *cmem = &pair.second;
|
||||
|
||||
/* Can only move textures allocated on this device (and not those from peer devices).
|
||||
* And need to ignore memory that is already on the host. */
|
||||
if (!mem.is_resident(this) || cmem->use_mapped_host) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
|
||||
(&mem != &texture_info);
|
||||
bool is_image = is_texture && (mem.data_height > 1);
|
||||
|
||||
/* Can't move this type of memory. */
|
||||
if (!is_texture || cmem->array) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* For other textures, only move image textures. */
|
||||
if (for_texture && !is_image) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Try to move largest allocation, prefer moving images. */
|
||||
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
|
||||
max_is_image = is_image;
|
||||
max_size = mem.device_size;
|
||||
max_mem = &mem;
|
||||
}
|
||||
}
|
||||
lock.unlock();
|
||||
|
||||
/* Move to host memory. This part is mutex protected since
|
||||
* multiple CUDA devices could be moving the memory. The
|
||||
* first one will do it, and the rest will adopt the pointer. */
|
||||
if (max_mem) {
|
||||
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
|
||||
|
||||
static thread_mutex move_mutex;
|
||||
thread_scoped_lock lock(move_mutex);
|
||||
|
||||
any_device_moving_textures_to_host = true;
|
||||
|
||||
/* Potentially need to call back into multi device, so pointer mapping
|
||||
* and peer devices are updated. This is also necessary since the device
|
||||
* pointer may just be a key here, so cannot be accessed and freed directly.
|
||||
* Unfortunately it does mean that memory is reallocated on all other
|
||||
* devices as well, which is potentially dangerous when still in use (since
|
||||
* a thread rendering on another devices would only be caught in this mutex
|
||||
* if it so happens to do an allocation at the same time as well. */
|
||||
max_mem->device_copy_to();
|
||||
size = (max_size >= size) ? 0 : size - max_size;
|
||||
|
||||
any_device_moving_textures_to_host = false;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
|
||||
move_texture_to_host = false;
|
||||
|
||||
/* Update texture info array with new pointers. */
|
||||
load_texture_info();
|
||||
}
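The selection loop above keeps evicting the single largest suitable allocation, preferring image textures, until enough device memory has been freed. The heuristic in isolation, as a hedged Python sketch (allocation records are hypothetical stand-ins for the device memory map):

```python
def pick_allocation_to_move(allocations, for_texture):
    """Pick the next allocation to move to host memory: prefer images, then the largest size.
    Each allocation is a dict like {"name", "size", "is_image", "movable"}."""
    best = None
    for mem in allocations:
        if not mem["movable"]:
            continue
        if for_texture and not mem["is_image"]:
            continue
        if best is None or (mem["is_image"], mem["size"]) > (best["is_image"], best["size"]):
            best = mem
    return best

allocs = [
    {"name": "kernel_data", "size": 64 << 20, "is_image": False, "movable": True},
    {"name": "env_texture", "size": 512 << 20, "is_image": True, "movable": True},
]
print(pick_allocation_to_move(allocs, for_texture=True)["name"])  # env_texture
```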
|
||||
|
||||
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
|
||||
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
|
||||
{
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
CUdeviceptr device_pointer = 0;
|
||||
size_t size = mem.memory_size() + pitch_padding;
|
||||
|
||||
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
|
||||
const char *status = "";
|
||||
|
||||
/* First try allocating in device memory, respecting headroom. We make
|
||||
* an exception for texture info. It is small and frequently accessed,
|
||||
* so treat it as working memory.
|
||||
*
|
||||
* If there is not enough room for working memory, we will try to move
|
||||
* textures to host memory, assuming the performance impact would have
|
||||
* been worse for working memory. */
|
||||
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
|
||||
bool is_image = is_texture && (mem.data_height > 1);
|
||||
|
||||
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
|
||||
|
||||
size_t total = 0, free = 0;
|
||||
cuMemGetInfo(&free, &total);
|
||||
|
||||
/* Move textures to host memory if needed. */
|
||||
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
|
||||
move_textures_to_host(size + headroom - free, is_texture);
|
||||
cuMemGetInfo(&free, &total);
|
||||
}
|
||||
|
||||
/* Allocate in device memory. */
|
||||
if (!move_texture_to_host && (size + headroom) < free) {
|
||||
mem_alloc_result = cuMemAlloc(&device_pointer, size);
|
||||
if (mem_alloc_result == CUDA_SUCCESS) {
|
||||
status = " in device memory";
|
||||
}
|
||||
}
|
||||
|
||||
/* Fall back to mapped host memory if needed and possible. */
|
||||
|
||||
void *shared_pointer = 0;
|
||||
|
||||
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
|
||||
if (mem.shared_pointer) {
|
||||
/* Another device already allocated host memory. */
|
||||
mem_alloc_result = CUDA_SUCCESS;
|
||||
shared_pointer = mem.shared_pointer;
|
||||
}
|
||||
else if (map_host_used + size < map_host_limit) {
|
||||
/* Allocate host memory ourselves. */
|
||||
mem_alloc_result = cuMemHostAlloc(
|
||||
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
|
||||
|
||||
assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
|
||||
(mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
|
||||
}
|
||||
|
||||
if (mem_alloc_result == CUDA_SUCCESS) {
|
||||
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
|
||||
map_host_used += size;
|
||||
status = " in host memory";
|
||||
}
|
||||
}
|
||||
|
||||
if (mem_alloc_result != CUDA_SUCCESS) {
|
||||
if (mem.type == MEM_DEVICE_ONLY) {
|
||||
status = " failed, out of device memory";
|
||||
set_error("System is out of GPU memory");
|
||||
}
|
||||
else {
|
||||
status = " failed, out of device and host memory";
|
||||
set_error("System is out of GPU and shared host memory");
|
||||
}
|
||||
}
|
||||
|
||||
if (mem.name) {
|
||||
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
|
||||
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
|
||||
<< string_human_readable_size(mem.memory_size()) << ")" << status;
|
||||
}
|
||||
|
||||
mem.device_pointer = (device_ptr)device_pointer;
|
||||
mem.device_size = size;
|
||||
stats.mem_alloc(size);
|
||||
|
||||
if (!mem.device_pointer) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Insert into map of allocations. */
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
CUDAMem *cmem = &cuda_mem_map[&mem];
|
||||
if (shared_pointer != 0) {
|
||||
/* Replace host pointer with our host allocation. Only works if
|
||||
* CUDA memory layout is the same and has no pitch padding. Also
|
||||
* does not work if we move textures to host during a render,
|
||||
* since other devices might be using the memory. */
|
||||
|
||||
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
|
||||
mem.host_pointer != shared_pointer) {
|
||||
memcpy(shared_pointer, mem.host_pointer, size);
|
||||
|
||||
/* A Call to device_memory::host_free() should be preceded by
|
||||
* a call to device_memory::device_free() for host memory
|
||||
* allocated by a device to be handled properly. Two exceptions
|
||||
* are here and a call in OptiXDevice::generic_alloc(), where
|
||||
* the current host memory can be assumed to be allocated by
|
||||
* device_memory::host_alloc(), not by a device */
|
||||
|
||||
mem.host_free();
|
||||
mem.host_pointer = shared_pointer;
|
||||
}
|
||||
mem.shared_pointer = shared_pointer;
|
||||
mem.shared_counter++;
|
||||
cmem->use_mapped_host = true;
|
||||
}
|
||||
else {
|
||||
cmem->use_mapped_host = false;
|
||||
}
|
||||
|
||||
return cmem;
|
||||
}
|
||||
|
||||
void CUDADevice::generic_copy_to(device_memory &mem)
|
||||
bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
|
||||
{
|
||||
if (!mem.host_pointer || !mem.device_pointer) {
|
||||
return;
|
||||
}
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
|
||||
* cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
|
||||
* mem.host_pointer. */
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
|
||||
const CUDAContextScope scope(this);
|
||||
cuda_assert(
|
||||
cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
|
||||
}
|
||||
CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
|
||||
return mem_alloc_result == CUDA_SUCCESS;
|
||||
}
|
||||
|
||||
void CUDADevice::generic_free(device_memory &mem)
|
||||
void CUDADevice::free_device(void *device_pointer)
|
||||
{
|
||||
if (mem.device_pointer) {
|
||||
CUDAContextScope scope(this);
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
|
||||
const CUDAMem &cmem = cuda_mem_map[&mem];
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
/* If cmem.use_mapped_host is true, reference counting is used
|
||||
* to safely free a mapped host memory. */
|
||||
cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
|
||||
}
|
||||
|
||||
if (cmem.use_mapped_host) {
|
||||
assert(mem.shared_pointer);
|
||||
if (mem.shared_pointer) {
|
||||
assert(mem.shared_counter > 0);
|
||||
if (--mem.shared_counter == 0) {
|
||||
if (mem.host_pointer == mem.shared_pointer) {
|
||||
mem.host_pointer = 0;
|
||||
}
|
||||
cuMemFreeHost(mem.shared_pointer);
|
||||
mem.shared_pointer = 0;
|
||||
}
|
||||
}
|
||||
map_host_used -= mem.device_size;
|
||||
}
|
||||
else {
|
||||
/* Free device memory. */
|
||||
cuda_assert(cuMemFree(mem.device_pointer));
|
||||
}
|
||||
bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
|
||||
{
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
stats.mem_free(mem.device_size);
|
||||
mem.device_pointer = 0;
|
||||
mem.device_size = 0;
|
||||
CUresult mem_alloc_result = cuMemHostAlloc(
|
||||
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
|
||||
return mem_alloc_result == CUDA_SUCCESS;
|
||||
}
|
||||
|
||||
cuda_mem_map.erase(cuda_mem_map.find(&mem));
|
||||
}
|
||||
void CUDADevice::free_host(void *shared_pointer)
|
||||
{
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
cuMemFreeHost(shared_pointer);
|
||||
}
|
||||
|
||||
bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
|
||||
{
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
|
||||
return true;
|
||||
}
|
||||
|
||||
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
|
||||
{
|
||||
const CUDAContextScope scope(this);
|
||||
|
||||
cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
|
||||
}
|
||||
|
||||
void CUDADevice::mem_alloc(device_memory &mem)
|
||||
@@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)
|
||||
|
||||
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
|
||||
* regardless of mem.host_pointer and mem.shared_pointer. */
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
|
||||
const CUDAContextScope scope(this);
|
||||
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
|
||||
}
|
||||
@@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
|
||||
return;
|
||||
}
|
||||
|
||||
CUDAMem *cmem = NULL;
|
||||
Mem *cmem = NULL;
|
||||
CUarray array_3d = NULL;
|
||||
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
|
||||
size_t dst_pitch = src_pitch;
|
||||
|
||||
if (!mem.is_resident(this)) {
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
cmem = &cuda_mem_map[&mem];
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
cmem = &device_mem_map[&mem];
|
||||
cmem->texobject = 0;
|
||||
|
||||
if (mem.data_depth > 1) {
|
||||
array_3d = (CUarray)mem.device_pointer;
|
||||
cmem->array = array_3d;
|
||||
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
|
||||
}
|
||||
else if (mem.data_height > 0) {
|
||||
dst_pitch = align_up(src_pitch, pitch_alignment);
|
||||
@@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
|
||||
mem.device_size = size;
|
||||
stats.mem_alloc(size);
|
||||
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
cmem = &cuda_mem_map[&mem];
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
cmem = &device_mem_map[&mem];
|
||||
cmem->texobject = 0;
|
||||
cmem->array = array_3d;
|
||||
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
|
||||
}
|
||||
else if (mem.data_height > 0) {
|
||||
/* 2D texture, using pitch aligned linear memory. */
|
||||
@@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
|
||||
texDesc.filterMode = filter_mode;
|
||||
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
|
||||
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
cmem = &cuda_mem_map[&mem];
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
cmem = &device_mem_map[&mem];
|
||||
|
||||
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
|
||||
|
||||
@@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
|
||||
{
|
||||
if (mem.device_pointer) {
|
||||
CUDAContextScope scope(this);
|
||||
thread_scoped_lock lock(cuda_mem_map_mutex);
|
||||
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
|
||||
const CUDAMem &cmem = cuda_mem_map[&mem];
|
||||
thread_scoped_lock lock(device_mem_map_mutex);
|
||||
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
|
||||
const Mem &cmem = device_mem_map[&mem];
|
||||
|
||||
if (cmem.texobject) {
|
||||
/* Free bindless texture. */
|
||||
@@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)
|
||||
|
||||
if (!mem.is_resident(this)) {
|
||||
/* Do not free memory here, since it was allocated on a different device. */
|
||||
cuda_mem_map.erase(cuda_mem_map.find(&mem));
|
||||
device_mem_map.erase(device_mem_map.find(&mem));
|
||||
}
|
||||
else if (cmem.array) {
|
||||
/* Free array. */
|
||||
cuArrayDestroy(cmem.array);
|
||||
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
|
||||
stats.mem_free(mem.device_size);
|
||||
mem.device_pointer = 0;
|
||||
mem.device_size = 0;
|
||||
|
||||
cuda_mem_map.erase(cuda_mem_map.find(&mem));
|
||||
device_mem_map.erase(device_mem_map.find(&mem));
|
||||
}
|
||||
else {
|
||||
lock.unlock();
|
||||

@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN

class DeviceQueue;

class CUDADevice : public Device {
class CUDADevice : public GPUDevice {

friend class CUDAContextScope;

@@ -29,36 +29,11 @@ class CUDADevice : public Device {
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;

struct CUDAMem {
CUDAMem() : texobject(0), array(0), use_mapped_host(false)
{
}

CUtexObject texobject;
CUarray array;

/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
thread_mutex cuda_mem_map_mutex;

/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;

CUDADeviceKernels kernels;

static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {

void reserve_local_memory(const uint kernel_features);

void init_host_memory();

void load_texture_info();

void move_textures_to_host(size_t size, bool for_texture);

CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);

void generic_copy_to(device_memory &mem);

void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

void mem_alloc(device_memory &mem) override;

@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
return nullptr;
}

GPUDevice::~GPUDevice() noexcept(false)
{
}

bool GPUDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
return true;
}
else {
return false;
}
}

void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
size_t preferred_working_headroom)
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();

if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}

/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory headroom lower than the texture headroom so there
* is space left for working memory. */
device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
32 * 1024 * 1024LL; // 32MB
device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
128 * 1024 * 1024LL; // 128MB

VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
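
For reference, a stand-alone sketch (not part of the patch) of what the mapped-host limit rule above works out to for a couple of RAM sizes; the helper name is hypothetical and only the arithmetic mirrors the code:

```cpp
#include <cstddef>
#include <cstdio>

/* Hypothetical stand-alone version of the rule above: keep free the smaller
 * of (half of system RAM, 4 GB), so the limit is the remainder. */
static size_t compute_map_host_limit(size_t system_ram)
{
  const size_t default_limit = 4ull * 1024 * 1024 * 1024;
  if (system_ram == 0) {
    return 0; /* RAM size unknown: mapped host memory disabled. */
  }
  return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
}

int main()
{
  /* 64 GB system -> 60 GB limit; 6 GB system -> 3 GB limit. */
  printf("%zu GB\n", compute_map_host_limit(64ull << 30) >> 30);
  printf("%zu GB\n", compute_map_host_limit(6ull << 30) >> 30);
  return 0;
}
```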

void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}

/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;

while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;

thread_scoped_lock lock(device_mem_map_mutex);
foreach (MemMap::value_type &pair, device_mem_map) {
device_memory &mem = *pair.first;
Mem *cmem = &pair.second;

/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}

bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);

/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}

/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}

/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();

/* Move to host memory. This part is mutex protected since
* multiple backend devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;

static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);

any_device_moving_textures_to_host = true;

/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;

any_device_moving_textures_to_host = false;
}
else {
break;
}
}

/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;

/* Update texture info array with new pointers. */
load_texture_info();
}

GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
void *device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;

bool mem_alloc_result = false;
const char *status = "";

/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);

size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;

size_t total = 0, free = 0;
get_device_memory_info(total, free);

/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
get_device_memory_info(total, free);
}

/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = alloc_device(device_pointer, size);
if (mem_alloc_result) {
device_mem_in_use += size;
status = " in device memory";
}
}

/* Fall back to mapped host memory if needed and possible. */

void *shared_pointer = 0;

if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = true;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = alloc_host(shared_pointer, size);

assert((mem_alloc_result && shared_pointer != 0) ||
(!mem_alloc_result && shared_pointer == 0));
}

if (mem_alloc_result) {
assert(transform_host_pointer(device_pointer, shared_pointer));
map_host_used += size;
status = " in host memory";
}
}

if (!mem_alloc_result) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}

if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}

mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);

if (!mem.device_pointer) {
return NULL;
}

/* Insert into map of allocations. */
thread_scoped_lock lock(device_mem_map_mutex);
Mem *cmem = &device_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */

if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);

/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device. */

mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}

return cmem;
}

void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */

if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
free_host(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}

stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

device_mem_map.erase(device_mem_map.find(&mem));
}
}

void GPUDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}

/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
* copy data from mem.host_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
}
}

/* DeviceInfo */

CCL_NAMESPACE_END
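
To make the division of labor concrete, here is a minimal sketch (not from the patch) of how a backend built on the shared helpers above might route its device_memory entry points. FooDevice and the routing details are hypothetical placeholders; the generic_* calls are the GPUDevice methods shown above:

```cpp
/* Illustrative only: a simplified backend delegating to GPUDevice helpers. */
void FooDevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for textures.");
  }
  else {
    generic_alloc(mem); /* Shared device/host-fallback allocation logic. */
  }
}

void FooDevice::mem_copy_to(device_memory &mem)
{
  if (!mem.device_pointer) {
    generic_alloc(mem);
  }
  generic_copy_to(mem); /* Ends up in the backend's copy_host_to_device(). */
}

void FooDevice::mem_free(device_memory &mem)
{
  generic_free(mem); /* Calls free_device() or free_host() as appropriate. */
}
```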

@@ -309,6 +309,93 @@ class Device {
static uint devices_initialized_mask;
};

/* GPU device with common functionality shared between GPU backends. */
class GPUDevice : public Device {
protected:
GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
: Device(info_, stats_, profiler_),
texture_info(this, "texture_info", MEM_GLOBAL),
need_texture_info(false),
can_map_host(false),
map_host_used(0),
map_host_limit(0),
device_texture_headroom(0),
device_working_headroom(0),
device_mem_map(),
device_mem_map_mutex(),
move_texture_to_host(false),
device_mem_in_use(0)
{
}

public:
virtual ~GPUDevice() noexcept(false);

/* For GPUs that can use bindless textures in some way or another. */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */
virtual bool load_texture_info();

protected:
/* Memory allocation, only accessed through device_memory. */
friend class device_memory;

bool can_map_host;
size_t map_host_used;
size_t map_host_limit;
size_t device_texture_headroom;
size_t device_working_headroom;
typedef unsigned long long texMemObject;
typedef unsigned long long arrayMemObject;
struct Mem {
Mem() : texobject(0), array(0), use_mapped_host(false)
{
}

texMemObject texobject;
arrayMemObject array;

/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, Mem> MemMap;
MemMap device_mem_map;
thread_mutex device_mem_map_mutex;
bool move_texture_to_host;
/* Simple counter that tracks the amount of used device memory. */
size_t device_mem_in_use;

virtual void init_host_memory(size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(size_t size, bool for_texture);

/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */
virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
virtual void generic_free(device_memory &mem);
virtual void generic_copy_to(device_memory &mem);

/* total - amount of device memory, free - amount of available device memory */
virtual void get_device_memory_info(size_t &total, size_t &free) = 0;

virtual bool alloc_device(void *&device_pointer, size_t size) = 0;

virtual void free_device(void *device_pointer) = 0;

virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;

virtual void free_host(void *shared_pointer) = 0;

/* This function should return the device pointer corresponding to the shared pointer, which
* is a host buffer allocated in `alloc_host`. The function should return `true` if such an
* address transformation is possible and `false` otherwise. */
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;

virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};

CCL_NAMESPACE_END

#endif /* __DEVICE_H__ */
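
A condensed illustration (not the patch's exact code) of what a backend has to supply on top of GPUDevice, using the CUDA driver API for concreteness. Context management, constructors and the other Device requirements are omitted, and the class name is hypothetical; the overridden signatures match the declarations above:

```cpp
#include <cuda.h>

class ExampleCUDADevice : public GPUDevice {
  virtual void get_device_memory_info(size_t &total, size_t &free) override
  {
    cuMemGetInfo(&free, &total);
  }

  virtual bool alloc_device(void *&device_pointer, size_t size) override
  {
    return cuMemAlloc((CUdeviceptr *)&device_pointer, size) == CUDA_SUCCESS;
  }

  virtual void free_device(void *device_pointer) override
  {
    cuMemFree((CUdeviceptr)device_pointer);
  }

  virtual bool alloc_host(void *&shared_pointer, size_t size) override
  {
    /* Pinned, device-mappable host memory for the mapped-host fallback. */
    return cuMemHostAlloc(&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP) == CUDA_SUCCESS;
  }

  virtual void free_host(void *shared_pointer) override
  {
    cuMemFreeHost(shared_pointer);
  }

  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    /* Map the pinned host buffer into the device address space. */
    return cuMemHostGetDevicePointer((CUdeviceptr *)&device_pointer, shared_pointer, 0) ==
           CUDA_SUCCESS;
  }

  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size);
  }
};
```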

@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
}

HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types. */
static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
static_assert(sizeof(arrayMemObject) == sizeof(hArray));

first_error = true;

hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

need_texture_info = false;

device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;

/* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* hipDeviceMapHost for mapping host memory when out of device memory.
* hipDeviceLmemResizeToMax for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
int value;
hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
can_map_host = value != 0;

hip_assert(
hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
# endif
}

void HIPDevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();

if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}

/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB

VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}

void HIPDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}

void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}

/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;

while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;

thread_scoped_lock lock(hip_mem_map_mutex);
foreach (HIPMemMap::value_type &pair, hip_mem_map) {
device_memory &mem = *pair.first;
HIPMem *cmem = &pair.second;

/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}

bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);

/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}

/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}

/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();

/* Move to host memory. This part is mutex protected since
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;

static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);

any_device_moving_textures_to_host = true;

/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught in this mutex
* if it so happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;

any_device_moving_textures_to_host = false;
}
else {
break;
}
}

/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;

/* Update texture info array with new pointers. */
load_texture_info();
}

HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
{
HIPContextScope scope(this);

hipDeviceptr_t device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;

hipError_t mem_alloc_result = hipErrorOutOfMemory;
const char *status = "";

/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);

size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;

size_t total = 0, free = 0;
hipMemGetInfo(&free, &total);

/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
hipMemGetInfo(&free, &total);
}

/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = hipMalloc(&device_pointer, size);
if (mem_alloc_result == hipSuccess) {
status = " in device memory";
}
}

/* Fall back to mapped host memory if needed and possible. */

void *shared_pointer = 0;

if (mem_alloc_result != hipSuccess && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = hipSuccess;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);

assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
}

if (mem_alloc_result == hipSuccess) {
hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}

if (mem_alloc_result != hipSuccess) {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}

if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}

mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);

if (!mem.device_pointer) {
return NULL;
}

/* Insert into map of allocations. */
thread_scoped_lock lock(hip_mem_map_mutex);
HIPMem *cmem = &hip_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* HIP memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */

if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);

/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device. */

mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}

return cmem;
}

void HIPDevice::generic_copy_to(device_memory &mem)
bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
HIPContextScope scope(this);

/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(
hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
return mem_alloc_result == hipSuccess;
}

void HIPDevice::generic_free(device_memory &mem)
void HIPDevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
HIPContextScope scope(this);

/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
hip_assert(hipFree((hipDeviceptr_t)device_pointer));
}

if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
hipHostFree(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
hip_assert(hipFree(mem.device_pointer));
}
bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
{
HIPContextScope scope(this);

stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hipError_t mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);

hip_mem_map.erase(hip_mem_map.find(&mem));
}
return mem_alloc_result == hipSuccess;
}

void HIPDevice::free_host(void *shared_pointer)
{
HIPContextScope scope(this);

hipHostFree(shared_pointer);
}

bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
HIPContextScope scope(this);

hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
return true;
}

void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const HIPContextScope scope(this);

hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
}

void HIPDevice::mem_alloc(device_memory &mem)
@@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)

/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
}
@@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}

HIPMem *cmem = NULL;
Mem *cmem = NULL;
hArray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;

if (!mem.is_resident(this)) {
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;

if (mem.data_depth > 1) {
array_3d = (hArray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);

thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;

thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];

hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];

if (cmem.texobject) {
/* Free bindless texture. */
@@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)

if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(cmem.array);
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;

hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();
@@ -1153,7 +906,7 @@ bool HIPDevice::should_use_graphics_interop()
* possible, but from the empirical measurements it can be considerably slower than using naive
* pixels copy. */

/* Disable graphics interop for now, because of driver bug in 21.40. See T92972 */
/* Disable graphics interop for now, because of driver bug in 21.40. See #92972 */
# if 0
HIPContextScope scope(this);

@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

class DeviceQueue;

class HIPDevice : public Device {
class HIPDevice : public GPUDevice {

friend class HIPContextScope;

@@ -26,36 +26,11 @@ class HIPDevice : public Device {
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;

struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}

hipTextureObject_t texobject;
hArray array;

/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;

/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;

HIPDeviceKernels kernels;

static bool have_precompiled_kernels();
@@ -81,17 +56,13 @@ class HIPDevice : public Device {
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);

void init_host_memory();

void load_texture_info();

void move_textures_to_host(size_t size, bool for_texture);

HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);

void generic_copy_to(device_memory &mem);

void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

void mem_alloc(device_memory &mem) override;

@@ -51,7 +51,7 @@ static inline bool hipSupportsDevice(const int hipDevId)
hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);

return (major >= 10);
return (major >= 9);
}

CCL_NAMESPACE_END


@@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:

@@ -247,6 +247,8 @@ class device_memory {
bool is_resident(Device *sub_device) const;

protected:
friend class Device;
friend class GPUDevice;
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;

@@ -21,6 +21,7 @@ class BVHMetal : public BVH {

API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;

bool motion_blur = false;

@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,

uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;

// Allocate look-up buffer for the worst-case scenario.
uint64_t count = objects.size();
blas_lookup.resize(count);

for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;

/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];

desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];

desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;

@@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;

if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}

devices.push_back(info);
device_index++;
}

@@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;

/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;

bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;

@@ -105,6 +110,8 @@ class MetalDevice : public Device {

bool use_adaptive_compilation();

bool use_local_atomic_sort() const;

bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);

void make_source(MetalPipelineType pso_type, const uint kernel_features);

@@ -105,6 +105,7 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
}
case METAL_GPU_AMD: {
max_threads_per_threadgroup = 128;
use_metalrt = info.use_metalrt;
break;
}
case METAL_GPU_APPLE: {
@@ -192,6 +193,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;

MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;

MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +209,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */

[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}

mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

// Prepare the BLAS argument encoder.
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];

for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@@ -271,6 +290,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}

bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}

void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@@ -278,6 +302,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}

if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}

if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@@ -558,7 +586,7 @@ void MetalDevice::erase_allocation(device_memory &mem)
if (it != metal_mem_map.end()) {
MetalMem *mmem = it->second.get();

/* blank out reference to MetalMem* in the launch params (fixes crash T94736) */
/* blank out reference to MetalMem* in the launch params (fixes crash #94736) */
if (mmem->pointer_index >= 0) {
device_ptr *pointers = (device_ptr *)&launch_params;
pointers[mmem->pointer_index] = 0;
@@ -1231,6 +1259,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;

// Allocate required buffers for the BLAS array.
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);

for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}

count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);

memcpy([blas_lookup_buffer contents],
bvhMetalRT->blas_lookup.data(),
blas_lookup_buffer.allocatedSize);

if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

@@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};

enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};

/* Pipeline State Object types */
enum MetalPipelineType {

@@ -87,6 +87,9 @@ struct ShaderCache {
break;
}
}

occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();

@@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];

NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];

if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

@@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;

virtual void init_execution() override;

@@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}

bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}

void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}

for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all acceleration structure resources as used. */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
@@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;

case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}

default:
break;
}

if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}

MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
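
A quick sanity check of the threadgroup memory the sort kernels request above: one int-sized counter per shader key, rounded up to 16 bytes. The numbers below are illustrative and not taken from the patch:

```cpp
#include <cstdio>

// Same rounding rule as the enqueue code above, reproduced for the example.
static int round_up(int value, int alignment)
{
  return ((value + alignment - 1) / alignment) * alignment;
}

int main()
{
  const int max_shaders = 100; /* hypothetical scene */
  const int key_count = max_shaders;
  const int shared_mem_bytes = round_up(key_count * (int)sizeof(int), 16);
  /* Prints 400, comfortably under the 32 KB asserted by the enqueue code. */
  printf("%d bytes of threadgroup memory\n", shared_mem_bytes);
  return 0;
}
```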

@@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
@@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}

/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}

metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
@@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}

# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
}
# endif

@@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}

OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
vector<OptixStackSizes> osl_stack_size(osl_groups.size());

/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
@@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
0,
&pipelines[PIP_SHADE]));

const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}

optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}

return !have_error();

@@ -112,6 +112,13 @@ class DeviceQueue {
return 65536;
}

/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
 * INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}

/* Initialize execution of kernels on this queue.
 *
 * Will, for example, load all data required by the kernels from Device to global or path state.
@@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),

@@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);

/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;

integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}

if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;

if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}

if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}

if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
}
@@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;

compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
compute_sorted_queued_paths(kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;

@@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}

void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;

/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;

int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;

DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);

queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
return;
}

device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);

@@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
&d_prefix_sum,
&d_queued_kernel);

queue_->enqueue(kernel, work_size, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
}
}
@@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);

void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);

void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();

@@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;
@@ -886,7 +886,7 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
{
/* Special trick for fast navigation: schedule multiple samples during fast navigation
 * (which will prefer to use lower resolution to keep up with refresh rate). This gives more
 * usable visual feedback for artists. There are a couple of tricks though. */
 * usable visual feedback for artists. */

if (is_denoise_active_during_update()) {
/* When denoising is used during navigation prefer using a higher resolution with less samples

@@ -896,25 +896,12 @@ int RenderScheduler::get_num_samples_during_navigation(int resolution_divider) c
return 1;
}

if (resolution_divider <= pixel_size_) {
/* When resolution divider is at or below pixel size, schedule one sample. This doesn't effect
 * the sample count at this resolution division, but instead assists in the calculation of
 * the resolution divider. */
return 1;
}

if (resolution_divider == pixel_size_ * 2) {
/* When resolution divider is the previous step to the final resolution, schedule two samples.
 * This is so that rendering on lower resolution does not exceed time that it takes to render
 * first sample at the full resolution. */
return 2;
}

/* Always render 4 samples, even if scene is configured for less.
 * The idea here is to have enough information on the screen. Resolution divider of 2 allows us
 * to have 4 time extra samples, so overall worst case timing is the same as the final resolution
 * at one sample. */
return 4;
/* Schedule samples equal to the resolution divider up to a maximum of 4.
 * The idea is to have enough information on the screen by increasing the sample count as the
 * resolution is decreased. */
/* NOTE: Changing this formula will change the formula in
 * `RenderScheduler::calculate_resolution_divider_for_time()`. */
return min(max(1, resolution_divider / pixel_size_), 4);
}
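As a worked check of the new return expression above (assuming `pixel_size_` is 1, which is the common case):

resolution_divider = 2  ->  min(max(1, 2 / 1), 4) = 2 samples
resolution_divider = 8  ->  min(max(1, 8 / 1), 4) = 4 samples

so the old special cases (2 samples one step before full resolution, 4 samples at lower resolutions) fall out of the single formula.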

bool RenderScheduler::work_need_adaptive_filter() const
@@ -1100,9 +1087,10 @@ void RenderScheduler::update_start_resolution_divider()
/* TODO(sergey): Need to add hysteresis to avoid resolution divider bouncing around when actual
 * render time is somewhere on a boundary between two resolutions. */

/* Never increase resolution to higher than the pixel size (which is possible if the scene is
 * simple and compute device is fast). */
start_resolution_divider_ = max(resolution_divider_for_update, pixel_size_);
/* Don't let resolution drop below the desired one. It's better to be slow than provide an
 * unreadable viewport render. */
start_resolution_divider_ = min(resolution_divider_for_update,
default_start_resolution_divider_);

VLOG_WORK << "Calculated resolution divider is " << start_resolution_divider_;
}
@@ -1187,24 +1175,24 @@ void RenderScheduler::check_time_limit_reached()

int RenderScheduler::calculate_resolution_divider_for_time(double desired_time, double actual_time)
{
/* TODO(sergey): There should a non-iterative analytical formula here. */
const double ratio_between_times = actual_time / desired_time;

int resolution_divider = 1;
/* We can pass `ratio_between_times` to `get_num_samples_during_navigation()` to get our
 * navigation samples because the equation for calculating the resolution divider is as follows:
 * `actual_time / desired_time = sqr(resolution_divider) / sample_count`.
 * While `resolution_divider` is less than or equal to 4, `resolution_divider = sample_count`
 * (This relationship is determined in `get_num_samples_during_navigation()`). With some
 * substitution we end up with `actual_time / desired_time = resolution_divider` while the
 * resolution divider is less than or equal to 4. Once the resolution divider increases above 4,
 * the relationship of `actual_time / desired_time = resolution_divider` is no longer true,
 * however the sample count retrieved from `get_num_samples_during_navigation()` is still
 * accurate if we continue using this assumption. It should be noted that the interaction between
 * `pixel_size`, sample count, and resolution divider are automatically accounted for and that's
 * why `pixel_size` isn't included in any of the equations. */
const int navigation_samples = get_num_samples_during_navigation(
ceil_to_int(ratio_between_times));

/* This algorithm iterates through resolution dividers until a divider is found that achieves
 * the desired render time. A limit of default_start_resolution_divider_ is put in place as the
 * maximum resolution divider to avoid an unreadable viewport due to a low resolution.
 * pre_resolution_division_samples and post_resolution_division_samples are used in this
 * calculation to better predict the performance impact of changing resolution divisions as
 * the sample count can also change between resolution divisions. */
while (actual_time > desired_time && resolution_divider < default_start_resolution_divider_) {
int pre_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
resolution_divider = resolution_divider * 2;
int post_resolution_division_samples = get_num_samples_during_navigation(resolution_divider);
actual_time /= 4.0 * pre_resolution_division_samples / post_resolution_division_samples;
}

return resolution_divider;
return ceil_to_int(sqrt(navigation_samples * ratio_between_times));
}
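A worked example of the closed-form replacement above (assuming `pixel_size_` is 1): if `actual_time` is 9 times `desired_time`, then `ratio_between_times = 9`, `navigation_samples = min(max(1, 9), 4) = 4`, and the function returns `ceil_to_int(sqrt(4 * 9)) = 6`. Substituting back into the stated relation `actual_time / desired_time = sqr(resolution_divider) / sample_count` gives `36 / 4 = 9`, matching the measured ratio.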

int calculate_resolution_divider_for_resolution(int width, int height, int resolution)
@@ -412,11 +412,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
# warn for other versions
if((CUDA_VERSION STREQUAL "101") OR
(CUDA_VERSION STREQUAL "102") OR
(CUDA_VERSION_MAJOR STREQUAL "11"))
(CUDA_VERSION_MAJOR STREQUAL "11") OR
(CUDA_VERSION_MAJOR STREQUAL "12"))
else()
message(WARNING
"CUDA version ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} detected, "
"build may succeed but only CUDA 11, 10.2 and 10.1 have been tested")
"build may succeed but only CUDA 12, 11, 10.2 and 10.1 have been tested")
endif()

# build for each arch

@@ -514,6 +515,16 @@ if(WITH_CYCLES_CUDA_BINARIES)
else()
message(STATUS "CUDA binaries for ${arch} require CUDA 10 or earlier, skipped.")
endif()
elseif(${arch} MATCHES ".*_3.")
if(DEFINED CUDA11_NVCC_EXECUTABLE)
set(cuda_nvcc_executable ${CUDA11_NVCC_EXECUTABLE})
set(cuda_toolkit_root_dir ${CUDA11_TOOLKIT_ROOT_DIR})
elseif("${CUDA_VERSION}" LESS 120) # Support for sm_35, sm_37 was removed in CUDA 12
set(cuda_nvcc_executable ${CUDA_NVCC_EXECUTABLE})
set(cuda_toolkit_root_dir ${CUDA_TOOLKIT_ROOT_DIR})
else()
message(STATUS "CUDA binaries for ${arch} require CUDA 11 or earlier, skipped.")
endif()
elseif(${arch} MATCHES ".*_7." AND "${CUDA_VERSION}" LESS 100)
message(STATUS "CUDA binaries for ${arch} require CUDA 10.0+, skipped.")
elseif(${arch} MATCHES ".*_8.")
@@ -661,7 +661,8 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
#endif
}

ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_private const ShaderClosure *sc)
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
Spectrum albedo = sc->weight;
/* Some closures include additional components such as Fresnel terms that cause their albedo to

@@ -685,7 +686,7 @@ ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_pri
albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
break;
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
albedo *= bsdf_principled_hair_albedo(sc);
albedo *= bsdf_principled_hair_albedo(sd, sc);
break;
default:
break;
@@ -478,10 +478,18 @@ ccl_device_inline float bsdf_principled_hair_albedo_roughness_scale(
return (((((0.245f * x) + 5.574f) * x - 10.73f) * x + 2.532f) * x - 0.215f) * x + 5.969f;
}

ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderClosure *sc)
ccl_device Spectrum bsdf_principled_hair_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
ccl_private PrincipledHairBSDF *bsdf = (ccl_private PrincipledHairBSDF *)sc;
return exp(-sqrt(bsdf->sigma) * bsdf_principled_hair_albedo_roughness_scale(bsdf->v));

const float cos_theta_o = cos_from_sin(dot(sd->wi, safe_normalize(sd->dPdu)));
const float cos_gamma_o = cos_from_sin(bsdf->extra->geom.w);
const float f = fresnel_dielectric_cos(cos_theta_o * cos_gamma_o, bsdf->eta);

const float roughness_scale = bsdf_principled_hair_albedo_roughness_scale(bsdf->v);
/* TODO(lukas): Adding the Fresnel term here as a workaround until the proper refactor. */
return exp(-sqrt(bsdf->sigma) * roughness_scale) + make_spectrum(f);
}

ccl_device_inline Spectrum
@@ -519,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}

/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;

return bsdf_microfacet_ggx_setup(bsdf);
}

ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{

@@ -613,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}

/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;

return bsdf_microfacet_beckmann_setup(bsdf);
}

ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturatef(bsdf->alpha_x);
@@ -90,8 +90,10 @@ ccl_device float schlick_fresnel(float u)
}

/* Calculate the fresnel color, which is a blend between white and the F0 color */
ccl_device_forceinline Spectrum
interpolate_fresnel_color(float3 L, float3 H, float ior, Spectrum F0)
ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
float3 H,
float ior,
Spectrum F0)
{
/* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
 * The reason why we use this remapping instead of directly doing the
@@ -10,7 +10,7 @@
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug T36316. */
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__

@@ -10,7 +10,7 @@
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug T36316. */
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif

@@ -10,7 +10,7 @@
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug T36316. */
/* SSE optimization disabled for now on 32 bit, see bug #36316. */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
@@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;

gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;

gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix

ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,

@@ -579,7 +645,7 @@ ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgb
const int y,
const half4 half_pixel)
{
/* Work around HIP issue with half float display, see T92972. */
/* Work around HIP issue with half float display, see #92972. */
#ifdef __KERNEL_HIP__
ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
out[0] = half_pixel.x;
@@ -178,7 +178,7 @@ __device__
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
(threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)

# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
@@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024

#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)

# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)

ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *buckets,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Zero the bucket sizes. */
if (local_id < max_shaders) {
atomic_store_local(&buckets[local_id], 0);
}

ccl_gpu_syncthreads();

/* Determine bucket sizes within the partitions. */

const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);

for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
}
}

ccl_gpu_syncthreads();

/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */

if (local_id == 0) {
int offset = 0;
for (int i = 0; i < max_shaders; i++) {
partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
offset = offset + atomic_load_local(&buckets[i]);
}

/* Store the number of active states in this partition. */
partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
}
}

ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
const int num_states_limit,
ccl_global int *indices,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *local_offset,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Calculate each partition's global offset from the prefix sum of the active state counts per
 * partition. */

if (local_id < max_shaders) {
int partition_offset = 0;
for (int i = 0; i < uint(grid_id); i++) {
int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
partition_offset += partition_key_count;
}

ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}

ccl_gpu_syncthreads();

/* Write the sorted active indices. */

const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);

ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);

for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
}
}
}

#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
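The two helpers above amount to a partitioned counting sort of active path indices by shader key. A minimal single-threaded C++ sketch of the same bucket/write scheme (illustrative only; the function and variable names are hypothetical and not part of Cycles, and the serial loops stand in for the threadgroup atomics):

#include <vector>

/* Count keys per bucket, prefix-sum the counts into start offsets,
 * then scatter each state index into its sorted position. */
static std::vector<int> sort_indices_by_key(const std::vector<unsigned> &keys, int max_shaders)
{
  std::vector<int> offsets(max_shaders + 1, 0);
  for (unsigned key : keys) {
    offsets[(key % max_shaders) + 1]++; /* Bucket pass: per-key counts. */
  }
  for (int i = 1; i <= max_shaders; i++) {
    offsets[i] += offsets[i - 1]; /* Prefix sum: bucket start offsets. */
  }
  std::vector<int> indices(keys.size());
  for (int state_index = 0; state_index < (int)keys.size(); state_index++) {
    indices[offsets[keys[state_index] % max_shaders]++] = state_index; /* Write pass. */
  }
  return indices;
}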

template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,
@@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
kernel_assert(!"Invalid ift_local");
return false;
}
# endif

metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
metalrt_intersector_type metalrt_intersect;

metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);

bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
if (local_isect) {
local_isect->num_hits = 0;
}
kernel_assert(!"Invalid ift_local_prim");
return false;
}
# endif

MetalRTIntersectionLocalPayload payload;
payload.self = ray->self;

@@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
payload.result = false;

typename metalrt_intersector_type::result_type intersection;
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);

# if defined(__METALRT_MOTION__)
metalrt_intersector_type metalrt_intersect;
typename metalrt_intersector_type::result_type intersection;

metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}

intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
# else

metalrt_blas_intersector_type metalrt_intersect;
typename metalrt_blas_intersector_type::result_type intersection;

metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}

// if we know we are going to get max one hit, like for random-sss-walk we can
// optimize and accept the first hit
if (max_hits == 1) {
metalrt_intersect.accept_any_intersection(true);
}

int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
// transform the ray into object's local space
Transform itfm = kernel_data_fetch(objects, local_object).itfm;
r.origin = transform_point(&itfm, r.origin);
r.direction = transform_direction(&itfm, r.direction);

intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
r,
metal_ancillaries->blas_accel_structs[blas_index].blas,
metal_ancillaries->ift_local_prim,
payload);
# endif

if (lcg_state) {
@@ -105,10 +105,11 @@ struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \

@@ -117,22 +118,24 @@ struct kernel_gpu_##name \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
const ushort metal_grid_id [[threadgroup_position_in_grid]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \

@@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,

# if defined(__METALRT_MOTION__)
# define METALRT_TAGS instancing, instance_motion, primitive_motion
# define METALRT_BLAS_TAGS , primitive_motion
# else
# define METALRT_TAGS instancing
# define METALRT_BLAS_TAGS
# endif /* __METALRT_MOTION__ */

typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
# if defined(__METALRT_MOTION__)
typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data, primitive_motion>
metalrt_blas_intersector_type;
# else
typedef acceleration_structure<> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
# endif

#endif /* __METALRT__ */

@@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};

#ifdef __METALRT__
struct MetalRTBlasWrapper {
metalrt_blas_as_type blas;
};
#endif

struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;

@@ -291,6 +312,9 @@ struct MetalAncillaries {
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_local;
metalrt_blas_ift_type ift_local_prim;
constant MetalRTBlasWrapper *blas_accel_structs;
constant int *blas_userID_to_index_lookUp;
#endif
};
@@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
#endif
}

[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri_prim(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint primitive_id [[primitive_id]],
float2 barycentrics [[barycentric_coord]],
float ray_tmax [[distance]])
{
//instance_id, aka the user_id has been removed. If we take this function we optimized the
//SSS for starting traversal from a primitive acceleration structure instead of the root of the global AS.
//this means we will always be intersecting the correct object no need for the userid to check
return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
}
[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],

@@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
return result;
}

[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
result.distance = ray_tmax;
result.accept = false;
result.continue_search = false;
return result;
}

template<uint intersection_type>
bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
@@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
@@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;

/* Partition/key offsets used when writing sorted active indices. */
ccl_global int *sort_partition_key_offsets;

/* Divisor used to partition active indices by locality when sorting by material. */
uint sort_partition_divisor;
} IntegratorStateGPU;
@@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;

# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif

atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

@@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;

# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif

atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
@@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);

@@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
/* Beckmann */
else {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);

@@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
}

bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;

sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}

ccl_device void osl_closure_microfacet_ggx_aniso_setup(

@@ -652,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
}

bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;

sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}

ccl_device void osl_closure_microfacet_beckmann_aniso_setup(
@@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__

/* TODO: solve internal compiler errors and enable light tree on HIP. */
#ifdef __KERNEL_HIP__
/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
# undef __LIGHT_TREE__
#endif

@@ -1508,6 +1509,8 @@ typedef enum DeviceKernel : int {
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
@@ -13,6 +13,7 @@
#include "scene/light.h"
#include "scene/mesh.h"
#include "scene/object.h"
#include "scene/osl.h"
#include "scene/pointcloud.h"
#include "scene/scene.h"
#include "scene/shader.h"

@@ -25,7 +26,6 @@

#ifdef WITH_OSL
# include "kernel/osl/globals.h"
# include "kernel/osl/services.h"
#endif

#include "util/foreach.h"

@@ -1717,20 +1717,7 @@ void GeometryManager::device_update_displacement_images(Device *device,
/* If any OSL node is used for displacement, it may reference a texture. But it's
 * unknown which ones, so have to load them all. */
if (has_osl_node) {
set<OSLRenderServices *> services_shared;
device->foreach_device([&services_shared](Device *sub_device) {
OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
services_shared.insert(og->services);
});

for (OSLRenderServices *services : services_shared) {
for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
if (it->second->handle.get_manager() == image_manager) {
const int slot = it->second->handle.svm_slot();
bump_images.insert(slot);
}
}
}
OSLShaderManager::osl_image_slots(device, image_manager, bump_images);
}
#endif

@@ -394,7 +394,7 @@ bool OSLShaderManager::osl_compile(const string &inputfile, const string &output

/* Compile.
 *
 * Mutex protected because the OSL compiler does not appear to be thread safe, see T92503. */
 * Mutex protected because the OSL compiler does not appear to be thread safe, see #92503. */
static thread_mutex osl_compiler_mutex;
thread_scoped_lock lock(osl_compiler_mutex);

@@ -665,6 +665,27 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
return node;
}

/* Static function, so only this file needs to be compiled with RTTI. */
void OSLShaderManager::osl_image_slots(Device *device,
ImageManager *image_manager,
set<int> &image_slots)
{
set<OSLRenderServices *> services_shared;
device->foreach_device([&services_shared](Device *sub_device) {
OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
services_shared.insert(og->services);
});

for (OSLRenderServices *services : services_shared) {
for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
if (it->second->handle.get_manager() == image_manager) {
const int slot = it->second->handle.svm_slot();
image_slots.insert(slot);
}
}
}
}

/* Graph Compiler */

OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
@@ -92,6 +92,9 @@ class OSLShaderManager : public ShaderManager {
const std::string &bytecode_hash = "",
const std::string &bytecode = "");

/* Get image slots used by OSL services on device. */
static void osl_image_slots(Device *device, ImageManager *image_manager, set<int> &image_slots);

private:
void texture_system_init();
void texture_system_free();
@@ -573,7 +573,7 @@ void ShaderManager::device_update_common(Device * /*device*/,
kfilm->is_rec709 = is_rec709;
}

void ShaderManager::device_free_common(Device *, DeviceScene *dscene, Scene *scene)
void ShaderManager::device_free_common(Device * /*device*/, DeviceScene *dscene, Scene * /*scene*/)
{
dscene->shaders.free();
}
@@ -520,7 +520,7 @@ bool TileManager::write_tile(const RenderBuffers &tile_buffers)
/* If there is an overscan used for the tile copy pixels into single continuous block of memory
 * without any "gaps".
 * This is a workaround for bug in OIIO (https://github.com/OpenImageIO/oiio/pull/3176).
 * Our task reference: T93008. */
 * Our task reference: #93008. */
if (tile_params.window_x || tile_params.window_y ||
tile_params.window_width != tile_params.width ||
tile_params.window_height != tile_params.height) {
@@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
return new_value.float_value;
}

# define atomic_fetch_and_add_uint32(p, x) \
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_sub_uint32(p, x) \
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_inc_uint32(p) \
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_dec_uint32(p) \
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_or_uint32(p, x) \
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
{
return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}

template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
{
return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}

ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
const float old_val,
@@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
{
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;

if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
use_local_atomic_sort = (atoi(str) != 0);
}

DebugFlags::OptiX::OptiX()
@@ -97,6 +97,9 @@ class DebugFlags {

/* Whether adaptive feature based runtime compile is enabled or not. */
bool adaptive_compile = false;

/* Whether local atomic sorting is enabled or not. */
bool use_local_atomic_sort = true;
};

/* Get instance of debug flags registry. */
@@ -74,7 +74,7 @@ ccl_device float fast_sinf(float x)
|
||||
*
|
||||
* Results on: [-2pi,2pi].
|
||||
*
|
||||
* Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
|
||||
* Examined 2173837240 values of sin: 0.00662760244 avg ULP diff, 2 max ULP,
|
||||
* 1.19209e-07 max error
|
||||
*/
|
||||
int q = fast_rint(x * M_1_PI_F);
|
||||
@@ -256,11 +256,11 @@ ccl_device float fast_acosf(float x)
|
||||
/* clamp and crush denormals. */
|
||||
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
|
||||
/* Based on http://www.pouet.net/topic.php?which=9132&page=2
|
||||
* 85% accurate (ulp 0)
|
||||
* 85% accurate (ULP 0)
|
||||
* Examined 2130706434 values of acos:
|
||||
* 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
|
||||
* 15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
|
||||
* Examined 2130706434 values of acos:
|
||||
* 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
|
||||
* 15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
|
||||
*/
|
||||
const float a = sqrtf(1.0f - m) *
|
||||
(1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
|
||||
@@ -270,9 +270,8 @@ ccl_device float fast_acosf(float x)
|
||||
ccl_device float fast_asinf(float x)
|
||||
{
|
||||
/* Based on acosf approximation above.
|
||||
* Max error is 4.51133e-05 (ulps are higher because we are consistently off
|
||||
* by a little amount).
|
||||
*/
|
||||
* Max error is 4.51133e-05 (ULPS are higher because we are consistently off
|
||||
* by a little amount). */
|
||||
const float f = fabsf(x);
|
||||
/* Clamp and crush denormals. */
|
||||
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
|
||||
@@ -290,9 +289,9 @@ ccl_device float fast_atanf(float x)
|
||||
const float t = s * s;
|
||||
/* http://mathforum.org/library/drmath/view/62672.html
|
||||
* Examined 4278190080 values of atan:
|
||||
* 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error // (with denormals)
|
||||
* 2.36864877 avg ULP diff, 302 max ULP, 6.55651e-06 max error // (with denormals)
|
||||
* Examined 4278190080 values of atan:
|
||||
* 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
|
||||
* 171160502 avg ULP diff, 855638016 max ULP, 6.55651e-06 max error // (crush denormals)
|
||||
*/
|
||||
float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
|
||||
if (a > 1.0f) {
|
||||
@@ -343,8 +342,8 @@ ccl_device float fast_log2f(float x)
|
||||
int exponent = (int)(bits >> 23) - 127;
|
||||
float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
|
||||
/* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
|
||||
* 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
|
||||
* ulp histogram:
|
||||
* 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.
|
||||
* ULP histogram:
|
||||
* 0 = 97.46%
|
||||
* 1 = 2.29%
|
||||
* 2 = 0.11%
|
||||
@@ -363,7 +362,7 @@ ccl_device float fast_log2f(float x)
|
||||
ccl_device_inline float fast_logf(float x)
|
||||
{
|
||||
/* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
|
||||
* 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
|
||||
* 0.313865375 avg ULP diff, 5148137 max ULP, 7.62939e-06 max error.
|
||||
*/
|
||||
return fast_log2f(x) * M_LN2_F;
|
||||
}
|
||||
@@ -371,7 +370,7 @@ ccl_device_inline float fast_logf(float x)
|
||||
ccl_device_inline float fast_log10(float x)
|
||||
{
|
||||
/* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
|
||||
* 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
|
||||
* 0.631237033 avg ULP diff, 4471615 max ULP, 3.8147e-06 max error.
|
||||
*/
|
||||
return fast_log2f(x) * M_LN2_F / M_LN10_F;
|
||||
}
|
||||
@@ -392,12 +391,12 @@ ccl_device float fast_exp2f(float x)
/* Range reduction. */
int m = (int)x;
x -= m;
x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ULPS!). */
/* 5th degree polynomial generated with sollya
* Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
* 232 max ulp.
* Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ULP diff,
* 232 max ULP.
*
* ulp histogram:
* ULP histogram:
* 0 = 87.81%
* 1 = 4.18%
*/
@@ -415,14 +414,14 @@ ccl_device float fast_exp2f(float x)
ccl_device_inline float fast_expf(float x)
{
/* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
* 2.6666452 avg ulp diff, 230 max ulp.
* 2.6666452 avg ULP diff, 230 max ULP.
*/
return fast_exp2f(x / M_LN2_F);
}

#if !defined(__KERNEL_GPU__) && !defined(_MSC_VER)
/* MSVC seems to have a code-gen bug here in at least SSE41/AVX, see
* T78047 and T78869 for details. Just disable for now, it only makes
* #78047 and #78869 for details. Just disable for now, it only makes
* a small difference in denoising performance. */
ccl_device float4 fast_exp2f4(float4 x)
{
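fast_exp2f's range reduction splits x into an integer part m and a fractional remainder, approximates 2^f on the reduced interval with a polynomial, then scales by 2^m. A rough standalone sketch of that structure, assuming a truncated Taylor polynomial and std::ldexp for the final scaling (the kernel itself uses a Sollya-fitted 5th-degree polynomial plus exponent-bit tricks, and truncates toward zero rather than flooring, so this is illustrative only):

```cpp
#include <cmath>
#include <cstdio>

// Illustrative exp2 via range reduction: x = m + f with f in [0, 1).
// The polynomial below is a crude Taylor stand-in, not the kernel's Sollya fit.
static float sketch_exp2f(float x) {
  const int m = (int)std::floor(x);  // integer part
  const float f = x - (float)m;      // fractional part in [0, 1)
  // Low-order approximation of 2^f = exp(f * ln 2) on [0, 1).
  const float p = 1.0f + f * (0.6931472f + f * (0.2402265f + f * 0.0555041f));
  return std::ldexp(p, m);           // scale by 2^m exactly
}

int main() {
  for (float x : {-3.7f, -0.5f, 0.0f, 1.25f, 10.1f}) {
    std::printf("x=%6.2f  sketch=%g  std=%g\n", x, sketch_exp2f(x), std::exp2(x));
  }
  return 0;
}
```

The accuracy of the fractional-part polynomial is what the quoted "avg ULP diff / max ULP" figures measure; the final scaling by 2^m only adjusts the exponent and adds no rounding error.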
@@ -454,7 +453,7 @@ ccl_device_inline float4 fast_expf4(float4 x)
ccl_device_inline float fast_exp10(float x)
{
/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
* 2.71732409 avg ulp diff, 232 max ulp.
* 2.71732409 avg ULP diff, 232 max ULP.
*/
return fast_exp2f(x * M_LN10_F / M_LN2_F);
}
@@ -475,7 +474,7 @@ ccl_device float fast_sinhf(float x)
float a = fabsf(x);
if (a > 1.0f) {
/* Examined 53389559 values of sinh on [1,87.3300018]:
* 33.6886442 avg ulp diff, 178 max ulp. */
* 33.6886442 avg ULP diff, 178 max ULP. */
float e = fast_expf(a);
return copysignf(0.5f * e - 0.5f / e, x);
}
@@ -495,7 +494,7 @@ ccl_device float fast_sinhf(float x)
ccl_device_inline float fast_coshf(float x)
{
/* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
* 1.78256726 avg ulp diff, 178 max ulp.
* 1.78256726 avg ULP diff, 178 max ULP.
*/
float e = fast_expf(fabsf(x));
return 0.5f * e + 0.5f / e;
@@ -506,7 +505,7 @@ ccl_device_inline float fast_tanhf(float x)
/* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
* 3.12924e-06 max error.
*/
/* NOTE: ulp error is high because of sub-optimal handling around the origin. */
/* NOTE: ULP error is high because of sub-optimal handling around the origin. */
float e = fast_expf(2.0f * fabsf(x));
return copysignf(1.0f - 2.0f / (1.0f + e), x);
}
@@ -579,7 +578,7 @@ ccl_device_inline float fast_erfcf(float x)
{
/* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
*
* ulp histogram:
* ULP histogram:
*
* 0 = 80.30%
*/
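The "avg ULP diff / max ULP" figures quoted throughout these comments come from exhaustively comparing each approximation against a reference and measuring the gap in units in the last place. A sketch of one common way to compute such a ULP distance, by mapping float bit patterns onto an ordered integer scale (it glosses over NaNs and uses a coarse sample loop rather than the exhaustive sweeps the comments describe):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Map a float's bit pattern to a monotonically ordered integer,
// so that adjacent representable floats differ by exactly 1.
static int64_t ordered_bits(float f) {
  int32_t i;
  std::memcpy(&i, &f, sizeof(f));
  return (i < 0) ? (int64_t)INT32_MIN - (int64_t)i : (int64_t)i;
}

// ULP distance between two finite floats (assumes neither is NaN).
static int64_t ulp_diff(float a, float b) {
  const int64_t d = ordered_bits(a) - ordered_bits(b);
  return d < 0 ? -d : d;
}

int main() {
  // Example: compare a cheap tanh approximation against the library one.
  auto fast_tanh = [](float x) {
    const float e = std::exp(2.0f * std::fabs(x));
    return std::copysign(1.0f - 2.0f / (1.0f + e), x);
  };
  int64_t max_ulp = 0;
  double sum_ulp = 0.0;
  int count = 0;
  for (float x = -4.0f; x <= 4.0f; x += 0.001f, ++count) {
    const int64_t d = ulp_diff(fast_tanh(x), std::tanh(x));
    if (d > max_ulp) {
      max_ulp = d;
    }
    sum_ulp += (double)d;
  }
  std::printf("avg ulp %.3f, max ulp %lld\n", sum_ulp / count, (long long)max_ulp);
  return 0;
}
```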
@@ -516,7 +516,7 @@ GHOST_TSuccess GHOST_ContextCGL::releaseNativeHandles()

/* OpenGL on Metal
*
* Use Metal layer to avoid Viewport lagging on macOS, see T60043. */
* Use Metal layer to avoid Viewport lagging on macOS, see #60043. */

static const MTLPixelFormat METAL_FRAMEBUFFERPIXEL_FORMAT = MTLPixelFormatBGRA8Unorm;
static const OSType METAL_CORE_VIDEO_PIXEL_FORMAT = kCVPixelFormatType_32BGRA;
@@ -141,7 +141,7 @@ GHOST_TSuccess GHOST_ContextGLX::initializeDrawingContext()
/* -------------------------------------------------------------------- */
#else
/* Important to initialize only GLXEW (_not_ GLEW),
* since this breaks w/ Mesa's `swrast`, see: T46431. */
* since this breaks w/ Mesa's `swrast`, see: #46431. */
glxewInit();
#endif /* USE_GLXEW_INIT_WORKAROUND */
@@ -302,7 +302,7 @@ bool GHOST_NDOFManager::setDevice(ushort vendor_id, ushort product_id)
switch (product_id) {
case 0xC62E: /* Plugged in. */
case 0xC62F: /* Wireless. */
case 0xC658: /* Wireless (3DConnexion Universal Wireless Receiver in WIN32), see T82412. */
case 0xC658: /* Wireless (3DConnexion Universal Wireless Receiver in WIN32), see #82412. */
{
device_type_ = NDOF_SpaceMouseWireless;
hid_map_button_num_ = 2;
@@ -341,7 +341,7 @@ bool GHOST_NDOFManager::setDevice(ushort vendor_id, ushort product_id)
hid_map_button_mask_ = int(~(UINT_MAX << hid_map_button_num_));
}

CLOG_INFO(LOG, 2, "%d buttons -> hex:%X", hid_map_button_num_, (uint)hid_map_button_mask_);
CLOG_INFO(LOG, 2, "%d buttons -> hex:%X", hid_map_button_num_, uint(hid_map_button_mask_));

return device_type_ != NDOF_UnknownDevice;
}
@@ -445,14 +445,14 @@ void GHOST_NDOFManager::updateButton(int button_number, bool press, uint64_t tim
2,
"button=%d, press=%d (out of range %d, ignoring!)",
button_number,
(int)press,
int(press),
hid_map_button_num_);
return;
}
const NDOF_ButtonT button = hid_map_[button_number];
if (button == NDOF_BUTTON_NONE) {
CLOG_INFO(
LOG, 2, "button=%d, press=%d (mapped to none, ignoring!)", button_number, (int)press);
LOG, 2, "button=%d, press=%d (mapped to none, ignoring!)", button_number, int(press));
return;
}

@@ -460,7 +460,7 @@ void GHOST_NDOFManager::updateButton(int button_number, bool press, uint64_t tim
2,
"button=%d, press=%d, name=%s",
button_number,
(int)press,
int(press),
ndof_button_names[button]);

GHOST_IWindow *window = system_.getWindowManager()->getActiveWindow();
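The mask expression in the hunk above, int(~(UINT_MAX << hid_map_button_num_)), yields an integer with the low hid_map_button_num_ bits set, one bit per supported button. A tiny standalone check of that bit arithmetic with made-up button counts:

```cpp
#include <climits>
#include <cstdio>

int main() {
  for (int button_num : {2, 6, 15}) {
    // Low `button_num` bits set, matching the diff's mask expression.
    const int mask = int(~(UINT_MAX << button_num));
    std::printf("buttons=%2d -> mask hex:%X\n", button_num, (unsigned)mask);
  }
  return 0;
}
```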
@@ -602,7 +602,7 @@ void GHOST_SystemSDL::processEvent(SDL_Event *sdl_event)
/* NOTE: the `sdl_sub_evt.keysym.sym` is truncated,
* for unicode support ghost has to be modified. */

/* TODO(@campbellbarton): support full unicode, SDL supports this but it needs to be
/* TODO(@ideasman42): support full unicode, SDL supports this but it needs to be
* explicitly enabled via #SDL_StartTextInput which GHOST would have to wrap. */
char utf8_buf[sizeof(GHOST_TEventKeyData::utf8_buf)] = {'\0'};
if (type == GHOST_kEventKeyDown) {
@@ -82,6 +82,8 @@
#include "CLG_log.h"

#ifdef USE_EVENT_BACKGROUND_THREAD
# include "GHOST_TimerTask.h"

# include <pthread.h>
#endif
@@ -139,7 +141,7 @@ constexpr size_t events_pending_default_size = 4096 / sizeof(void *);
* \{ */

/**
* GNOME (mutter 42.2 had a bug with confine not respecting scale - Hi-DPI), See: T98793.
* GNOME (mutter 42.2 had a bug with confine not respecting scale - Hi-DPI), See: #98793.
* Even though this has been fixed, at time of writing it's not yet in a release.
* Workaround the problem by implementing confine with a software cursor.
* While this isn't ideal, it's not adding a lot of overhead as software
@@ -174,7 +176,7 @@ static bool use_gnome_confine_hack = false;

/**
* KDE (plasma 5.26.1) has a bug where the cursor surface needs to be committed
* (via `wl_surface_commit`) when it was hidden and is being set to visible again, see: T102048.
* (via `wl_surface_commit`) when it was hidden and is being set to visible again, see: #102048.
* See: https://bugs.kde.org/show_bug.cgi?id=461001
*/
#define USE_KDE_TABLET_HIDDEN_CURSOR_HACK
@@ -195,8 +197,8 @@ static bool use_gnome_confine_hack = false;
* \{ */

/**
* Fix short-cut part of keyboard reading code not properly handling some keys, see: T102194.
* \note This is similar to X11 workaround by the same name, see: T47228.
* Fix short-cut part of keyboard reading code not properly handling some keys, see: #102194.
* \note This is similar to X11 workaround by the same name, see: #47228.
*/
#define USE_NON_LATIN_KB_WORKAROUND
@@ -239,7 +241,7 @@ enum {
BTN_STYLUS = 0x14b,
/** Use as right-mouse. */
BTN_STYLUS2 = 0x14c,
/** NOTE(@campbellbarton): Map to an additional button (not sure which hardware uses this). */
/** NOTE(@ideasman42): Map to an additional button (not sure which hardware uses this). */
BTN_STYLUS3 = 0x149,
};
@@ -768,7 +770,12 @@ struct GWL_Seat {
int32_t rate = 0;
/** Time (milliseconds) after which to start repeating keys. */
int32_t delay = 0;
/** Timer for key repeats. */
/**
* Timer for key repeats.
*
* \note For as long as #USE_EVENT_BACKGROUND_THREAD is defined, any access to this
* (including null checks, must lock `timer_mutex` first.
*/
GHOST_ITimerTask *timer = nullptr;
} key_repeat;
@@ -832,6 +839,42 @@ static bool gwl_seat_key_depressed_suppress_warning(const GWL_Seat *seat)
return suppress_warning;
}

/**
* \note Caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_add(GWL_Seat *seat,
GHOST_TimerProcPtr key_repeat_fn,
GHOST_TUserDataPtr payload,
const bool use_delay)
{
GHOST_SystemWayland *system = seat->system;
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerTask *timer = new GHOST_TimerTask(
system->getMilliSeconds() + time_start, time_step, key_repeat_fn, payload);
seat->key_repeat.timer = timer;
system->ghost_timer_manager()->addTimer(timer);
#else
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
#endif
}

/**
* \note The caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_remove(GWL_Seat *seat)
{
GHOST_SystemWayland *system = seat->system;
#ifdef USE_EVENT_BACKGROUND_THREAD
system->ghost_timer_manager()->removeTimer(
static_cast<GHOST_TimerTask *>(seat->key_repeat.timer));
#else
system->removeTimer(seat->key_repeat.timer);
#endif
seat->key_repeat.timer = nullptr;
}

/** \} */

/* -------------------------------------------------------------------- */
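These two helpers centralize creation and teardown of the key-repeat timer: with USE_EVENT_BACKGROUND_THREAD the timer is registered with the WAYLAND-local ghost_timer_manager(), otherwise with the system's installTimer(), and every caller is expected to hold timer_mutex around both the null check and the call. The following self-contained miniature models that calling pattern; it is not the GHOST API, just a sketch of the lock-around-check-and-mutate discipline the rest of this diff adopts:

```cpp
#include <functional>
#include <iostream>
#include <mutex>

/* Toy timer slot that may be touched from two threads, so the null check
 * and the add/remove are always performed under one lock. */
struct RepeatTimer {
  std::function<void()> callback;
  bool active = false;
};

struct Seat {
  std::mutex timer_mutex;
  RepeatTimer *timer = nullptr;
};

static void timer_add(Seat &seat, std::function<void()> fn) {
  seat.timer = new RepeatTimer{std::move(fn), true};
}

static void timer_remove(Seat &seat) {
  delete seat.timer;
  seat.timer = nullptr;
}

/* Callers lock once around the check *and* the mutation, as the diff does. */
static void restart_repeat(Seat &seat, std::function<void()> fn) {
  std::lock_guard<std::mutex> guard(seat.timer_mutex);
  if (seat.timer) {
    timer_remove(seat);
  }
  timer_add(seat, std::move(fn));
}

int main() {
  Seat seat;
  restart_repeat(seat, [] { std::cout << "repeat\n"; });
  restart_repeat(seat, [] { std::cout << "repeat (reset)\n"; });
  {
    std::lock_guard<std::mutex> guard(seat.timer_mutex);
    if (seat.timer) {
      timer_remove(seat);
    }
  }
  return 0;
}
```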
@@ -873,7 +916,7 @@ struct GWL_Display {
|
||||
* The main purpose of having an active seat is an alternative from always using the first
|
||||
* seat which prevents events from any other seat.
|
||||
*
|
||||
* NOTE(@campbellbarton): This could be extended and developed further extended to support
|
||||
* NOTE(@ideasman42): This could be extended and developed further extended to support
|
||||
* an active seat per window (for e.g.), basic support is sufficient for now as currently isn't
|
||||
* a widely used feature.
|
||||
*/
|
||||
@@ -906,6 +949,16 @@ struct GWL_Display {
|
||||
/** Guard against multiple threads accessing `events_pending` at once. */
|
||||
std::mutex events_pending_mutex;
|
||||
|
||||
/**
|
||||
* A separate timer queue, needed so the WAYLAND thread can lock access.
|
||||
* Using the system's #GHOST_Sysem::getTimerManager is not thread safe because
|
||||
* access to the timer outside of WAYLAND specific logic will not lock.
|
||||
*
|
||||
* Needed because #GHOST_System::dispatchEvents fires timers
|
||||
* outside of WAYLAND (without locking the `timer_mutex`).
|
||||
*/
|
||||
GHOST_TimerManager *ghost_timer_manager = nullptr;
|
||||
|
||||
#endif /* USE_EVENT_BACKGROUND_THREAD */
|
||||
};
|
||||
|
||||
@@ -959,6 +1012,13 @@ static void gwl_display_destroy(GWL_Display *display)
|
||||
gwl_display_event_thread_destroy(display);
|
||||
display->system->server_mutex->unlock();
|
||||
}
|
||||
|
||||
/* Important to remove after the seats which may have key repeat timers active. */
|
||||
if (display->ghost_timer_manager) {
|
||||
delete display->ghost_timer_manager;
|
||||
display->ghost_timer_manager = nullptr;
|
||||
}
|
||||
|
||||
#endif /* USE_EVENT_BACKGROUND_THREAD */
|
||||
|
||||
if (display->wl_display) {
|
||||
@@ -1179,7 +1239,7 @@ static void gwl_registry_entry_remove_all(GWL_Display *display)
|
||||
{
|
||||
const bool on_exit = true;
|
||||
|
||||
/* NOTE(@campbellbarton): Free by slot instead of simply looping over
|
||||
/* NOTE(@ideasman42): Free by slot instead of simply looping over
|
||||
* `display->registry_entry` so the order of freeing is always predictable.
|
||||
* Otherwise global objects would be feed in the order they are registered.
|
||||
* While this works in my tests, it could cause difficult to reproduce bugs
|
||||
@@ -1209,7 +1269,7 @@ static void gwl_registry_entry_remove_all(GWL_Display *display)
|
||||
* so there is no reason to update all other outputs that an output was removed (for e.g.).
|
||||
* Pass as -1 to update all slots.
|
||||
*
|
||||
* NOTE(@campbellbarton): Updating all other items on a single change is typically worth avoiding.
|
||||
* NOTE(@ideasman42): Updating all other items on a single change is typically worth avoiding.
|
||||
* In practice this isn't a problem as so there are so few elements in `display->registry_entry`,
|
||||
* so few use update functions and adding/removal at runtime is rarely called (plugging/unplugging)
|
||||
* hardware for e.g. So while it's possible to store dependency links to avoid unnecessary
|
||||
@@ -1258,7 +1318,7 @@ static void ghost_wl_display_report_error(struct wl_display *display)
|
||||
fprintf(stderr, "The Wayland connection experienced a fatal error: %s\n", strerror(ecode));
|
||||
}
|
||||
|
||||
/* NOTE(@campbellbarton): The application is running,
|
||||
/* NOTE(@ideasman42): The application is running,
|
||||
* however an error closes all windows and most importantly:
|
||||
* shuts down the GPU context (loosing all GPU state - shaders, bind codes etc),
|
||||
* so recovering from this effectively involves restarting.
|
||||
@@ -1268,7 +1328,7 @@ static void ghost_wl_display_report_error(struct wl_display *display)
|
||||
* So in practice re-connecting to the display server isn't an option.
|
||||
*
|
||||
* Exit since leaving the process open will simply flood the output and do nothing.
|
||||
* Although as the process is in a valid state, auto-save for e.g. is possible, see: T100855. */
|
||||
* Although as the process is in a valid state, auto-save for e.g. is possible, see: #100855. */
|
||||
::exit(-1);
|
||||
}
|
||||
|
||||
@@ -1382,7 +1442,7 @@ static GHOST_TKey xkb_map_gkey(const xkb_keysym_t sym)
|
||||
|
||||
/* Additional keys for non US layouts. */
|
||||
|
||||
/* Uses the same physical key as #XKB_KEY_KP_Decimal for QWERTZ layout, see: T102287. */
|
||||
/* Uses the same physical key as #XKB_KEY_KP_Decimal for QWERTZ layout, see: #102287. */
|
||||
GXMAP(gkey, XKB_KEY_KP_Separator, GHOST_kKeyNumpadPeriod);
|
||||
|
||||
default:
|
||||
@@ -2910,7 +2970,7 @@ static void gesture_pinch_handle_begin(void *data,
|
||||
if (wl_surface *wl_surface_focus = seat->pointer.wl_surface_window) {
|
||||
win = ghost_wl_surface_user_data(wl_surface_focus);
|
||||
}
|
||||
/* NOTE(@campbellbarton): Blender's use of track-pad coordinates is inconsistent and needs work.
|
||||
/* NOTE(@ideasman42): Blender's use of track-pad coordinates is inconsistent and needs work.
|
||||
* This isn't specific to WAYLAND, in practice they tend to work well enough in most cases.
|
||||
* Some operators scale by the UI scale, some don't.
|
||||
* Even this window scale is not correct because it doesn't account for:
|
||||
@@ -2924,7 +2984,7 @@ static void gesture_pinch_handle_begin(void *data,
|
||||
*/
|
||||
const wl_fixed_t win_scale = win ? win->scale() : 1;
|
||||
|
||||
/* NOTE(@campbellbarton): Scale factors match Blender's operators & default preferences.
|
||||
/* NOTE(@ideasman42): Scale factors match Blender's operators & default preferences.
|
||||
* For these values to work correctly, operator logic will need to be changed not to scale input
|
||||
* by the region size (as with 3D view zoom) or preference for 3D view orbit sensitivity.
|
||||
*
|
||||
@@ -3087,7 +3147,7 @@ static const struct zwp_pointer_gesture_swipe_v1_listener gesture_swipe_listener
|
||||
/* -------------------------------------------------------------------- */
|
||||
/** \name Listener (Touch Seat), #wl_touch_listener
|
||||
*
|
||||
* NOTE(@campbellbarton): It's not clear if this interface is used by popular compositors.
|
||||
* NOTE(@ideasman42): It's not clear if this interface is used by popular compositors.
|
||||
* It looks like GNOME/KDE only support `zwp_pointer_gestures_v1_interface`.
|
||||
* If this isn't used anywhere, it could be removed.
|
||||
* \{ */
|
||||
@@ -3718,9 +3778,14 @@ static void keyboard_handle_leave(void *data,
GWL_Seat *seat = static_cast<GWL_Seat *>(data);
seat->keyboard.wl_surface_window = nullptr;

/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}

#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
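The change above wraps the existing check in an anonymous brace scope so the std::lock_guard only covers the timer access, leaving the rest of the handler to run without the lock. A minimal standalone illustration of that scoping idiom (toy globals, not GHOST types):

```cpp
#include <cstdio>
#include <mutex>

static std::mutex timer_mutex;
static int *timer = nullptr;  // stand-in for the repeat timer

static void handle_leave() {
  /* Scope the lock to just the timer check/cancel so the remainder of the
   * handler (which may take other locks) runs without holding it. */
  {
    std::lock_guard<std::mutex> guard(timer_mutex);
    if (timer) {
      delete timer;
      timer = nullptr;
    }
  }
  /* ... remainder of the handler runs unlocked ... */
  std::puts("left surface");
}

int main() {
  timer = new int(1);
  handle_leave();
  return 0;
}
```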
@@ -3743,9 +3808,9 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
|
||||
/* Use an empty keyboard state to access key symbol without modifiers. */
|
||||
xkb_keysym_t sym = xkb_state_key_get_one_sym(xkb_state_empty, key);
|
||||
|
||||
/* NOTE(@campbellbarton): Only perform the number-locked lookup as a fallback
|
||||
/* NOTE(@ideasman42): Only perform the number-locked lookup as a fallback
|
||||
* when a number-pad key has been pressed. This is important as some key-maps use number lock
|
||||
* for switching other layers (in particular `de(neo_qwertz)` turns on layer-4), see: T96170.
|
||||
* for switching other layers (in particular `de(neo_qwertz)` turns on layer-4), see: #96170.
|
||||
* Alternative solutions could be to inspect the layout however this could get involved
|
||||
* and turning on the number-lock is only needed for a limited set of keys. */
|
||||
|
||||
@@ -3780,36 +3845,32 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
return sym;
}

/**
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_cancel(GWL_Seat *seat)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller much check for timer");
delete static_cast<GWL_KeyRepeatPlayload *>(seat->key_repeat.timer->getUserData());
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;

gwl_seat_key_repeat_timer_remove(seat);
}

/**
* Restart the key-repeat timer.
* \param use_delay: When false, use the interval
* (prevents pause when the setting changes while the key is held).
*
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_reset(GWL_Seat *seat, const bool use_delay)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller much check for timer");
GHOST_SystemWayland *system = seat->system;
GHOST_ITimerTask *timer = seat->key_repeat.timer;
GHOST_TimerProcPtr key_repeat_fn = timer->getTimerProc();
GHOST_TimerProcPtr key_repeat_fn = seat->key_repeat.timer->getTimerProc();
GHOST_TUserDataPtr payload = seat->key_repeat.timer->getUserData();
seat->system->removeTimer(seat->key_repeat.timer);
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);

gwl_seat_key_repeat_timer_remove(seat);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, payload, use_delay);
}

static void keyboard_handle_key(void *data,
@@ -3848,6 +3909,11 @@ static void keyboard_handle_key(void *data,
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef USE_EVENT_BACKGROUND_THREAD
|
||||
/* Any access to `seat->key_repeat.timer` must lock. */
|
||||
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
|
||||
#endif
|
||||
|
||||
struct GWL_KeyRepeatPlayload *key_repeat_payload = nullptr;
|
||||
|
||||
/* Delete previous timer. */
|
||||
@@ -3872,7 +3938,7 @@ static void keyboard_handle_key(void *data,
|
||||
else {
|
||||
/* Key-up from keys that were not repeating cause the repeat timer to pause.
|
||||
*
|
||||
* NOTE(@campbellbarton): This behavior isn't universal, some text input systems will
|
||||
* NOTE(@ideasman42): This behavior isn't universal, some text input systems will
|
||||
* stop the repeat entirely. Choose to pause repeat instead as this is what GTK/WIN32 do,
|
||||
* and it fits better for keyboard input that isn't related to text entry. */
|
||||
timer_action = RESET;
|
||||
@@ -3886,23 +3952,14 @@ static void keyboard_handle_key(void *data,
|
||||
break;
|
||||
}
|
||||
case RESET: {
|
||||
#ifdef USE_EVENT_BACKGROUND_THREAD
|
||||
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
|
||||
#endif
|
||||
/* The payload will be added again. */
|
||||
seat->system->removeTimer(seat->key_repeat.timer);
|
||||
seat->key_repeat.timer = nullptr;
|
||||
gwl_seat_key_repeat_timer_remove(seat);
|
||||
break;
|
||||
}
|
||||
case CANCEL: {
|
||||
#ifdef USE_EVENT_BACKGROUND_THREAD
|
||||
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
|
||||
#endif
|
||||
delete key_repeat_payload;
|
||||
key_repeat_payload = nullptr;
|
||||
|
||||
seat->system->removeTimer(seat->key_repeat.timer);
|
||||
seat->key_repeat.timer = nullptr;
|
||||
gwl_seat_key_repeat_timer_remove(seat);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -3956,8 +4013,8 @@ static void keyboard_handle_key(void *data,
|
||||
utf8_buf));
|
||||
}
|
||||
};
|
||||
seat->key_repeat.timer = seat->system->installTimer(
|
||||
seat->key_repeat.delay, 1000 / seat->key_repeat.rate, key_repeat_fn, key_repeat_payload);
|
||||
|
||||
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, key_repeat_payload, true);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3982,8 +4039,13 @@ static void keyboard_handle_modifiers(void *data,
|
||||
|
||||
/* A modifier changed so reset the timer,
|
||||
* see comment in #keyboard_handle_key regarding this behavior. */
|
||||
if (seat->key_repeat.timer) {
|
||||
keyboard_handle_key_repeat_reset(seat, true);
|
||||
{
|
||||
#ifdef USE_EVENT_BACKGROUND_THREAD
|
||||
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
|
||||
#endif
|
||||
if (seat->key_repeat.timer) {
|
||||
keyboard_handle_key_repeat_reset(seat, true);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
|
||||
@@ -4002,9 +4064,14 @@ static void keyboard_repeat_handle_info(void *data,
|
||||
seat->key_repeat.rate = rate;
|
||||
seat->key_repeat.delay = delay;
|
||||
|
||||
/* Unlikely possible this setting changes while repeating. */
|
||||
if (seat->key_repeat.timer) {
|
||||
keyboard_handle_key_repeat_reset(seat, false);
|
||||
{
|
||||
#ifdef USE_EVENT_BACKGROUND_THREAD
|
||||
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
|
||||
#endif
|
||||
/* Unlikely possible this setting changes while repeating. */
|
||||
if (seat->key_repeat.timer) {
|
||||
keyboard_handle_key_repeat_reset(seat, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4275,8 +4342,14 @@ static void gwl_seat_capability_keyboard_disable(GWL_Seat *seat)
|
||||
if (!seat->wl_keyboard) {
|
||||
return;
|
||||
}
|
||||
if (seat->key_repeat.timer) {
|
||||
keyboard_handle_key_repeat_cancel(seat);
|
||||
|
||||
{
|
||||
#ifdef USE_EVENT_BACKGROUND_THREAD
|
||||
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
|
||||
#endif
|
||||
if (seat->key_repeat.timer) {
|
||||
keyboard_handle_key_repeat_cancel(seat);
|
||||
}
|
||||
}
|
||||
wl_keyboard_destroy(seat->wl_keyboard);
|
||||
seat->wl_keyboard = nullptr;
|
||||
@@ -4394,7 +4467,7 @@ static void xdg_output_handle_logical_size(void *data,
|
||||
|
||||
#ifdef USE_GNOME_CONFINE_HACK
|
||||
/* Use a bug in GNOME to check GNOME is in use. If the bug is fixed this won't cause an issue
|
||||
* as T98793 has been fixed up-stream too, but not in a release at time of writing. */
|
||||
* as #98793 has been fixed up-stream too, but not in a release at time of writing. */
|
||||
use_gnome_confine_hack = true;
|
||||
#endif
|
||||
|
||||
@@ -5411,6 +5484,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)

#ifdef USE_EVENT_BACKGROUND_THREAD
gwl_display_event_thread_create(display_);

display_->ghost_timer_manager = new GHOST_TimerManager();
#endif
}
@@ -5491,10 +5566,16 @@ bool GHOST_SystemWayland::processEvents(bool waitForEvent)
#endif /* USE_EVENT_BACKGROUND_THREAD */

{
const uint64_t now = getMilliSeconds();
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
{
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
if (ghost_timer_manager()->fireTimers(now)) {
any_processed = true;
}
}
#endif
if (getTimerManager()->fireTimers(getMilliSeconds())) {
if (getTimerManager()->fireTimers(now)) {
any_processed = true;
}
}
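With the background-thread build there are now two timer managers to service: the WAYLAND-local one behind timer_mutex and the regular one from getTimerManager(), and the hunk samples getMilliSeconds() once so both fire against the same timestamp. A small self-contained model of that control flow (a toy manager with a fireTimers(now) method, not the GHOST classes):

```cpp
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <vector>

/* Toy stand-in for a timer manager: reports whether anything was due at `now`. */
struct ToyTimerManager {
  std::vector<uint64_t> deadlines;
  bool fireTimers(uint64_t now) {
    bool fired = false;
    for (uint64_t t : deadlines) {
      if (t <= now) {
        fired = true;
      }
    }
    return fired;
  }
};

int main() {
  ToyTimerManager wayland_local;  // would be guarded by `timer_mutex`
  ToyTimerManager system_wide;
  std::mutex timer_mutex;
  wayland_local.deadlines = {100, 250};
  system_wide.deadlines = {200};

  bool any_processed = false;
  const uint64_t now = 220;  /* sample the clock once, use it for both */
  {
    std::lock_guard<std::mutex> guard(timer_mutex);
    if (wayland_local.fireTimers(now)) {
      any_processed = true;
    }
  }
  if (system_wide.fireTimers(now)) {
    any_processed = true;
  }
  std::printf("any_processed=%d\n", (int)any_processed);
  return 0;
}
```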
@@ -6717,6 +6798,13 @@ struct wl_shm *GHOST_SystemWayland::wl_shm() const
return display_->wl_shm;
}

#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerManager *GHOST_SystemWayland::ghost_timer_manager()
{
return display_->ghost_timer_manager;
}
#endif

/** \} */

/* -------------------------------------------------------------------- */
@@ -6949,7 +7037,7 @@ bool GHOST_SystemWayland::window_cursor_grab_set(const GHOST_TGrabCursorMode mod
|
||||
UNPACK2(xy_next));
|
||||
wl_surface_commit(wl_surface);
|
||||
|
||||
/* NOTE(@campbellbarton): The new cursor position is a hint,
|
||||
/* NOTE(@ideasman42): The new cursor position is a hint,
|
||||
* it's possible the hint is ignored. It doesn't seem like there is a good way to
|
||||
* know if the hint will be used or not, at least not immediately. */
|
||||
xy_motion[0] = xy_next[0];
|
||||
@@ -6992,7 +7080,7 @@ bool GHOST_SystemWayland::window_cursor_grab_set(const GHOST_TGrabCursorMode mod
|
||||
if (mode != GHOST_kGrabDisable) {
|
||||
if (grab_state_next.use_lock) {
|
||||
if (!grab_state_prev.use_lock) {
|
||||
/* TODO(@campbellbarton): As WAYLAND does not support warping the pointer it may not be
|
||||
/* TODO(@ideasman42): As WAYLAND does not support warping the pointer it may not be
|
||||
* possible to support #GHOST_kGrabWrap by pragmatically settings it's coordinates.
|
||||
* An alternative could be to draw the cursor in software (and hide the real cursor),
|
||||
* or just accept a locked cursor on WAYLAND. */
|
||||
|
@@ -165,6 +165,16 @@ class GHOST_SystemWayland : public GHOST_System {

bool cursor_grab_use_software_display_get(const GHOST_TGrabCursorMode mode);

#ifdef USE_EVENT_BACKGROUND_THREAD
/**
* Return a separate WAYLAND local timer manager to #GHOST_System::getTimerManager
* Manipulation & access must lock with #GHOST_WaylandSystem::server_mutex.
*
* See #GWL_Display::ghost_timer_manager doc-string for details on why this is needed.
*/
GHOST_TimerManager *ghost_timer_manager();
#endif

/* WAYLAND direct-data access. */

struct wl_display *wl_display();
@@ -198,7 +208,7 @@ class GHOST_SystemWayland : public GHOST_System {
* Clear all references to this output.
*
* \note The compositor should have already called the `wl_surface_listener.leave` callback,
* however some compositors may not (see T103586).
* however some compositors may not (see #103586).
* So remove references to the output before it's destroyed to avoid crashing.
*
* \return true when any references were removed.
@@ -233,7 +243,14 @@ class GHOST_SystemWayland : public GHOST_System {
* from running at the same time. */
std::mutex *server_mutex = nullptr;

/** Threads must lock this before manipulating timers. */
/**
* Threads must lock this before manipulating #GWL_Display::ghost_timer_manager.
*
* \note Using a separate lock to `server_mutex` is necessary because the
* server lock is already held when calling `ghost_wl_display_event_pump`.
* If manipulating the timer used the `server_mutex`, event pump can indirectly
* handle key up/down events which would lock `server_mutex` causing a dead-lock.
*/
std::mutex *timer_mutex = nullptr;

std::thread::id main_thread_id;
@@ -424,10 +424,9 @@ bool GHOST_SystemWin32::processEvents(bool waitForEvent)
|
||||
|
||||
processTrackpad();
|
||||
|
||||
/* PeekMessage above is allowed to dispatch messages to the wndproc without us
|
||||
/* `PeekMessage` above is allowed to dispatch messages to the `wndproc` without us
|
||||
* noticing, so we need to check the event manager here to see if there are
|
||||
* events waiting in the queue.
|
||||
*/
|
||||
* events waiting in the queue. */
|
||||
hasEventHandled |= this->m_eventManager->getNumEvents() > 0;
|
||||
|
||||
} while (waitForEvent && !hasEventHandled);
|
||||
@@ -566,8 +565,8 @@ GHOST_TKey GHOST_SystemWin32::hardKey(RAWINPUT const &raw, bool *r_key_down)
|
||||
/**
|
||||
* \note this function can be extended to include other exotic cases as they arise.
|
||||
*
|
||||
* This function was added in response to bug T25715.
|
||||
* This is going to be a long list T42426.
|
||||
* This function was added in response to bug #25715.
|
||||
* This is going to be a long list #42426.
|
||||
*/
|
||||
GHOST_TKey GHOST_SystemWin32::processSpecialKey(short vKey, short scanCode) const
|
||||
{
|
||||
@@ -1080,11 +1079,11 @@ GHOST_EventCursor *GHOST_SystemWin32::processCursorEvent(GHOST_WindowWin32 *wind
|
||||
if (window->getCursorGrabMode() == GHOST_kGrabHide) {
|
||||
window->getClientBounds(bounds);
|
||||
|
||||
/* WARNING(@campbellbarton): The current warping logic fails to warp on every event,
|
||||
/* WARNING(@ideasman42): The current warping logic fails to warp on every event,
|
||||
* so the box needs to small enough not to let the cursor escape the window but large
|
||||
* enough that the cursor isn't being warped every time.
|
||||
* If this was not the case it would be less trouble to simply warp the cursor to the
|
||||
* center of the screen on every motion, see: D16558 (alternative fix for T102346). */
|
||||
* center of the screen on every motion, see: D16558 (alternative fix for #102346). */
|
||||
const int32_t subregion_div = 4; /* One quarter of the region. */
|
||||
const int32_t size[2] = {bounds.getWidth(), bounds.getHeight()};
|
||||
const int32_t center[2] = {(bounds.m_l + bounds.m_r) / 2, (bounds.m_t + bounds.m_b) / 2};
|
||||
@@ -1179,7 +1178,7 @@ GHOST_EventKey *GHOST_SystemWin32::processKeyEvent(GHOST_WindowWin32 *window, RA
|
||||
GHOST_TKey key = system->hardKey(raw, &key_down);
|
||||
GHOST_EventKey *event;
|
||||
|
||||
/* NOTE(@campbellbarton): key repeat in WIN32 also applies to modifier-keys.
|
||||
/* NOTE(@ideasman42): key repeat in WIN32 also applies to modifier-keys.
|
||||
* Check for this case and filter out modifier-repeat.
|
||||
* Typically keyboard events are *not* filtered as part of GHOST's event handling.
|
||||
* As other GHOST back-ends don't have the behavior, it's simplest not to send them through.
|
||||
@@ -1210,7 +1209,7 @@ GHOST_EventKey *GHOST_SystemWin32::processKeyEvent(GHOST_WindowWin32 *window, RA
|
||||
const bool ctrl_pressed = has_state && state[VK_CONTROL] & 0x80;
|
||||
const bool alt_pressed = has_state && state[VK_MENU] & 0x80;
|
||||
|
||||
/* We can be here with !key_down if processing dead keys (diacritics). See T103119. */
|
||||
/* We can be here with !key_down if processing dead keys (diacritics). See #103119. */
|
||||
|
||||
/* No text with control key pressed (Alt can be used to insert special characters though!). */
|
||||
if (ctrl_pressed && !alt_pressed) {
|
||||
|
@@ -282,7 +282,7 @@ class GHOST_SystemWin32 : public GHOST_System {
|
||||
GHOST_TSuccess exit();
|
||||
|
||||
/**
|
||||
* Converts raw WIN32 key codes from the wndproc to GHOST keys.
|
||||
* Converts raw WIN32 key codes from the `wndproc` to GHOST keys.
|
||||
* \param vKey: The virtual key from #hardKey.
|
||||
* \param ScanCode: The ScanCode of pressed key (similar to PS/2 Set 1).
|
||||
* \param extend: Flag if key is not primly (left or right).
|
||||
@@ -291,7 +291,7 @@ class GHOST_SystemWin32 : public GHOST_System {
|
||||
GHOST_TKey convertKey(short vKey, short ScanCode, short extend) const;
|
||||
|
||||
/**
|
||||
* Catches raw WIN32 key codes from WM_INPUT in the wndproc.
|
||||
* Catches raw WIN32 key codes from WM_INPUT in the `wndproc`.
|
||||
* \param raw: RawInput structure with detailed info about the key event.
|
||||
* \param r_key_down: Set true when the key is pressed, otherwise false.
|
||||
* \return The GHOST key (GHOST_kKeyUnknown if no match).
|
||||
@@ -319,8 +319,8 @@ class GHOST_SystemWin32 : public GHOST_System {
|
||||
* Creates tablet events from pointer events.
|
||||
* \param type: The type of pointer event.
|
||||
* \param window: The window receiving the event (the active window).
|
||||
* \param wParam: The wParam from the wndproc.
|
||||
* \param lParam: The lParam from the wndproc.
|
||||
* \param wParam: The wParam from the `wndproc`.
|
||||
* \param lParam: The lParam from the `wndproc`.
|
||||
* \param eventhandled: True if the method handled the event.
|
||||
*/
|
||||
static void processPointerEvent(
|
||||
@@ -337,8 +337,8 @@ class GHOST_SystemWin32 : public GHOST_System {
|
||||
/**
|
||||
* Handles a mouse wheel event.
|
||||
* \param window: The window receiving the event (the active window).
|
||||
* \param wParam: The wParam from the wndproc.
|
||||
* \param lParam: The lParam from the wndproc.
|
||||
* \param wParam: The wParam from the `wndproc`.
|
||||
* \param lParam: The lParam from the `wndproc`.
|
||||
*/
|
||||
static void processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam);
|
||||
|
||||
|
@@ -46,7 +46,7 @@
|
||||
|
||||
#ifdef WITH_X11_XFIXES
|
||||
# include <X11/extensions/Xfixes.h>
|
||||
/* Workaround for XWayland grab glitch: T53004. */
|
||||
/* Workaround for XWayland grab glitch: #53004. */
|
||||
# define WITH_XWAYLAND_HACK
|
||||
#endif
|
||||
|
||||
@@ -71,11 +71,11 @@
|
||||
# define USE_XINPUT_HOTPLUG
|
||||
#endif
|
||||
|
||||
/* see T34039 Fix Alt key glitch on Unity desktop */
|
||||
/* see #34039 Fix Alt key glitch on Unity desktop */
|
||||
#define USE_UNITY_WORKAROUND
|
||||
|
||||
/* Fix 'shortcut' part of keyboard reading code only ever using first defined key-map
|
||||
* instead of active one. See T47228 and D1746 */
|
||||
* instead of active one. See #47228 and D1746 */
|
||||
#define USE_NON_LATIN_KB_WORKAROUND
|
||||
|
||||
static uchar bit_is_on(const uchar *ptr, int bit)
|
||||
@@ -278,7 +278,7 @@ uint8_t GHOST_SystemX11::getNumDisplays() const
|
||||
void GHOST_SystemX11::getMainDisplayDimensions(uint32_t &width, uint32_t &height) const
|
||||
{
|
||||
if (m_display) {
|
||||
/* NOTE(@campbellbarton): for this to work as documented,
|
||||
/* NOTE(@ideasman42): for this to work as documented,
|
||||
* we would need to use Xinerama check r54370 for code that did this,
|
||||
* we've since removed since its not worth the extra dependency. */
|
||||
getAllDisplayDimensions(width, height);
|
||||
@@ -927,8 +927,8 @@ void GHOST_SystemX11::processEvent(XEvent *xe)
|
||||
if (window->getCursorGrabMode() == GHOST_kGrabHide) {
|
||||
window->getClientBounds(bounds);
|
||||
|
||||
/* TODO(@campbellbarton): warp the cursor to `window->getCursorGrabInitPos`,
|
||||
* on every motion event, see: D16557 (alternative fix for T102346). */
|
||||
/* TODO(@ideasman42): warp the cursor to `window->getCursorGrabInitPos`,
|
||||
* on every motion event, see: D16557 (alternative fix for #102346). */
|
||||
const int32_t subregion_div = 4; /* One quarter of the region. */
|
||||
const int32_t size[2] = {bounds.getWidth(), bounds.getHeight()};
|
||||
const int32_t center[2] = {
|
||||
@@ -964,7 +964,7 @@ void GHOST_SystemX11::processEvent(XEvent *xe)
|
||||
if (x_new != xme.x_root || y_new != xme.y_root) {
|
||||
/* Use time of last event to avoid wrapping several times on the 'same' actual wrap.
|
||||
* Note that we need to deal with X and Y separately as those might wrap at the same time
|
||||
* but still in two different events (corner case, see T74918).
|
||||
* but still in two different events (corner case, see #74918).
|
||||
* We also have to add a few extra milliseconds of 'padding', as sometimes we get two
|
||||
* close events that will generate extra wrap on the same axis within those few
|
||||
* milliseconds. */
|
||||
@@ -1028,7 +1028,7 @@ void GHOST_SystemX11::processEvent(XEvent *xe)
|
||||
/* XXX: Code below is kinda awfully convoluted... Issues are:
|
||||
* - In keyboards like Latin ones, numbers need a 'Shift' to be accessed but key_sym
|
||||
* is unmodified (or anyone swapping the keys with `xmodmap`).
|
||||
* - #XLookupKeysym seems to always use first defined key-map (see T47228), which generates
|
||||
* - #XLookupKeysym seems to always use first defined key-map (see #47228), which generates
|
||||
* key-codes unusable by ghost_key_from_keysym for non-Latin-compatible key-maps.
|
||||
*
|
||||
* To address this, we:
|
||||
@@ -1715,7 +1715,7 @@ GHOST_TSuccess GHOST_SystemX11::setCursorPosition(int32_t x, int32_t y)
|
||||
|
||||
#if defined(WITH_X11_XINPUT) && defined(USE_X11_XINPUT_WARP)
|
||||
if ((m_xinput_version.present) && (m_xinput_version.major_version >= 2)) {
|
||||
/* Needed to account for XInput "Coordinate Transformation Matrix", see T48901 */
|
||||
/* Needed to account for XInput "Coordinate Transformation Matrix", see #48901 */
|
||||
int device_id;
|
||||
if (XIGetClientPointer(m_display, None, &device_id) != False) {
|
||||
XIWarpPointer(m_display, device_id, None, None, 0, 0, 0, 0, relx, rely);
|
||||
@@ -2015,8 +2015,8 @@ void GHOST_SystemX11::getClipboard_xcout(
|
||||
return;
|
||||
}
|
||||
|
||||
/* if it's not incr, and not format == 8, then there's
|
||||
* nothing in the selection (that xclip understands, anyway) */
|
||||
/* If it's not INCR, and not `format == 8`, then there's
|
||||
* nothing in the selection (that `xclip` understands, anyway). */
|
||||
|
||||
if (pty_format != 8) {
|
||||
*context = XCLIB_XCOUT_NONE;
|
||||
|
@@ -20,8 +20,8 @@
|
||||
|
||||
/* Disable XINPUT warp, currently not implemented by Xorg for multi-head display.
|
||||
* (see comment in XSERVER `Xi/xiwarppointer.c` -> `FIXME: panoramix stuff is missing` ~ v1.13.4)
|
||||
* If this is supported we can add back XINPUT for warping (fixing T48901).
|
||||
* For now disable (see T50383). */
|
||||
* If this is supported we can add back XINPUT for warping (fixing #48901).
|
||||
* For now disable (see #50383). */
|
||||
// # define USE_X11_XINPUT_WARP
|
||||
#endif
|
||||
|
||||
|
@@ -669,7 +669,7 @@ static void xdg_surface_handle_configure(void *data,
|
||||
GHOST_SystemWayland *system = win->ghost_system;
|
||||
const bool is_main_thread = system->main_thread_id == std::this_thread::get_id();
|
||||
if (!is_main_thread) {
|
||||
/* NOTE(@campbellbarton): this only gets one redraw,
|
||||
/* NOTE(@ideasman42): this only gets one redraw,
|
||||
* I could not find a case where this causes problems. */
|
||||
gwl_window_pending_actions_tag(win, PENDING_FRAME_CONFIGURE);
|
||||
}
|
||||
@@ -774,7 +774,7 @@ GHOST_WindowWayland::GHOST_WindowWayland(GHOST_SystemWayland *system,
|
||||
window_->ghost_window = this;
|
||||
window_->ghost_system = system;
|
||||
|
||||
/* NOTE(@campbellbarton): The scale set here to avoid flickering on startup.
|
||||
/* NOTE(@ideasman42): The scale set here to avoid flickering on startup.
|
||||
* When all monitors use the same scale (which is quite common) there aren't any problems.
|
||||
*
|
||||
* When monitors have different scales there may still be a visible window resize on startup.
|
||||
@@ -812,12 +812,12 @@ GHOST_WindowWayland::GHOST_WindowWayland(GHOST_SystemWayland *system,
|
||||
* when the `window_->scale` changed. */
|
||||
const int32_t size_min[2] = {320, 240};
|
||||
|
||||
/* This value is expected to match the base name of the `.desktop` file. see T101805.
|
||||
/* This value is expected to match the base name of the `.desktop` file. see #101805.
|
||||
*
|
||||
* NOTE: the XDG desktop-entry-spec defines that this should follow the "reverse DNS" convention.
|
||||
* For e.g. `org.blender.Blender` - however the `.desktop` file distributed with Blender is
|
||||
* simply called `blender.desktop`, so the it's important to follow that name.
|
||||
* Other distributions such as SNAP & FLATPAK may need to change this value T101779.
|
||||
* Other distributions such as SNAP & FLATPAK may need to change this value #101779.
|
||||
* Currently there isn't a way to configure this, we may want to support that. */
|
||||
const char *xdg_app_id = (
|
||||
#ifdef WITH_GHOST_WAYLAND_APP_ID
|
||||
@@ -1078,9 +1078,9 @@ GHOST_WindowWayland::~GHOST_WindowWayland()
|
||||
|
||||
wl_surface_destroy(window_->wl_surface);
|
||||
|
||||
/* NOTE(@campbellbarton): Flushing will often run the appropriate handlers event
|
||||
/* NOTE(@ideasman42): Flushing will often run the appropriate handlers event
|
||||
* (#wl_surface_listener.leave in particular) to avoid attempted access to the freed surfaces.
|
||||
* This is not fool-proof though, hence the call to #window_surface_unref, see: T99078. */
|
||||
* This is not fool-proof though, hence the call to #window_surface_unref, see: #99078. */
|
||||
wl_display_flush(system_->wl_display());
|
||||
|
||||
delete window_;
|
||||
|
@@ -15,7 +15,7 @@
|
||||
#include <wayland-util.h> /* For #wl_fixed_t */
|
||||
|
||||
/**
|
||||
* Define to workaround for a bug/limitation in WAYLAND, see: T100855 & upstream report:
|
||||
* Define to workaround for a bug/limitation in WAYLAND, see: #100855 & upstream report:
|
||||
* https://gitlab.freedesktop.org/wayland/wayland/-/issues/159
|
||||
*
|
||||
* Consume events from WAYLAND in a thread, this is needed because overflowing the event queue
|
||||
|
@@ -154,7 +154,7 @@ GHOST_WindowWin32::GHOST_WindowWin32(GHOST_SystemWin32 *system,
|
||||
}
|
||||
|
||||
if (parentwindow) {
|
||||
/* Release any parent capture to allow immediate interaction (T90110). */
|
||||
/* Release any parent capture to allow immediate interaction (#90110). */
|
||||
::ReleaseCapture();
|
||||
parentwindow->lostMouseCapture();
|
||||
}
|
||||
|
@@ -418,7 +418,7 @@ void GHOST_WindowX11::refreshXInputDevices()
|
||||
for (GHOST_SystemX11::GHOST_TabletX11 &xtablet : m_system->GetXTablets()) {
|
||||
/* With modern XInput (XLIB 1.6.2 at least and/or EVDEV 2.9.0) and some 'no-name' tablets
|
||||
* like 'UC-LOGIC Tablet WP5540U', we also need to 'select' ButtonPress for motion event,
|
||||
* otherwise we do not get any tablet motion event once pen is pressed... See T43367.
|
||||
* otherwise we do not get any tablet motion event once pen is pressed... See #43367.
|
||||
*/
|
||||
XEventClass ev;
|
||||
|
||||
@@ -1467,7 +1467,7 @@ GHOST_TSuccess GHOST_WindowX11::setWindowCursorGrab(GHOST_TGrabCursorMode mode)
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform this last so to workaround XWayland bug, see: T53004. */
|
||||
/* Perform this last so to workaround XWayland bug, see: #53004. */
|
||||
if (m_cursorGrab == GHOST_kGrabHide) {
|
||||
setWindowCursorVisibility(true);
|
||||
}
|
||||
|
@@ -281,6 +281,8 @@ inline T *MEM_new(const char *allocation_name, Args &&...args)
*/
template<typename T> inline void MEM_delete(const T *ptr)
{
static_assert(!std::is_void_v<T>,
"MEM_delete on a void pointer not possible. Cast it to a non-void type?");
if (ptr == nullptr) {
/* Support #ptr being null, because C++ `delete` supports that as well. */
return;
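MEM_delete, as shown in the hunk above, rejects void pointers at compile time and treats null as a no-op, mirroring the behaviour of C++ delete. A self-contained sketch with the same shape, using plain operator new/delete where the real helper presumably routes through the guarded allocator:

```cpp
#include <cstdio>
#include <type_traits>

/* Sketch of a typed, null-tolerant delete helper in the spirit of MEM_delete.
 * Uses plain `operator delete`; the real helper frees via the guarded allocator. */
template<typename T> inline void typed_delete(const T *ptr)
{
  static_assert(!std::is_void_v<T>,
                "typed_delete on a void pointer not possible. Cast it to a non-void type?");
  if (ptr == nullptr) {
    /* Accept null, as C++ `delete` does. */
    return;
  }
  ptr->~T();                                /* Run the destructor. */
  ::operator delete(const_cast<T *>(ptr));  /* Release the storage. */
}

struct Payload {
  ~Payload() { std::puts("~Payload"); }
};

int main() {
  typed_delete(static_cast<Payload *>(nullptr)); /* No-op. */
  typed_delete(new Payload());                   /* Destroys and frees. */
  return 0;
}
```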
Some files were not shown because too many files have changed in this diff.