GPU: Cleanup GPU_batch.h documentation and some of the API for consistency

Documented all functions, adding use case and side effects. Also replace the use of shortened argument name by more meaningful ones. Renamed `GPU_batch_instbuf_add_ex` and `GPU_batch_vertbuf_add_ex` to remove the `ex` suffix as they are the main version used (removed the few usage of the other version). Renamed `GPU_batch_draw_instanced` to `GPU_batch_draw_instance_range` and make it consistent with `GPU_batch_draw_range`.
Cleanup: Move 6 sculpt-session-related files and header to C++
2023-02-11 12:52:58 -08:00 · 2023-02-11 12:52:55 -08:00 · 2023-02-11 12:50:45 -08:00 · 2023-02-11 12:50:45 -08:00 · 2023-02-11 12:50:45 -08:00 · 2023-02-11 12:50:45 -08:00
524 changed files with 8515 additions and 6198 deletions
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,5 +1,4 @@
-This repository is only used as a mirror of git.blender.org. Blender development happens on
-https://developer.blender.org.
+This repository is only used as a mirror. Blender development happens on projects.blender.org.

 To get started with contributing code, please see:
 https://wiki.blender.org/wiki/Process/Contributing_Code
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -15,8 +15,7 @@ staleLabel: stale
 # Comment to post when closing a stale Issue or Pull Request.
 closeComment: >
  This issue has been automatically closed, because this repository is only
-  used as a mirror of git.blender.org. Blender development happens on
-  developer.blender.org.
+  used as a mirror. Blender development happens on projects.blender.org.

  To get started contributing code, please read:
  https://wiki.blender.org/wiki/Process/Contributing_Code
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,20 +1,16 @@
 [submodule "release/scripts/addons"]
 	path = release/scripts/addons
 	url = ../blender-addons.git
-	branch = master
-	ignore = all
+	branch = main
 [submodule "release/scripts/addons_contrib"]
 	path = release/scripts/addons_contrib
 	url = ../blender-addons-contrib.git
-	branch = master
-	ignore = all
+	branch = main
 [submodule "release/datafiles/locale"]
 	path = release/datafiles/locale
 	url = ../blender-translations.git
-	branch = master
-	ignore = all
+	branch = main
 [submodule "source/tools"]
 	path = source/tools
 	url = ../blender-dev-tools.git
-	branch = master
-	ignore = all
+	branch = main
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Development
 -----------

 - [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
- [Code Review & Bug Tracker](https://developer.blender.org)
+- [Code Review & Bug Tracker](https://projects.blender.org)
 - [Developer Forum](https://devtalk.blender.org)
 - [Developer Documentation](https://wiki.blender.org)

--- a/build_files/build_environment/cmake/igc.cmake
+++ b/build_files/build_environment/cmake/igc.cmake
@@ -40,7 +40,8 @@ ExternalProject_Add(external_igc_llvm
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0004-OpenCL-support-cl_ext_float_atomics.patch &&
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/clang/0005-OpenCL-Add-cl_khr_integer_dot_product.patch &&
    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0001-Memory-leak-fix-for-Managed-Static-Mutex.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0002-Remove-repo-name-in-LLVM-IR.patch &&
+    ${PATCH_CMD} -p 1 -d ${IGC_LLVM_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/llvm/0003-Add-missing-include-limit-in-benchmark.patch
 )
 add_dependencies(
  external_igc_llvm
@@ -55,9 +56,6 @@ ExternalProject_Add(external_igc_spirv_translator
  CONFIGURE_COMMAND echo .
  BUILD_COMMAND echo .
  INSTALL_COMMAND echo .
-  PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0001-update-SPIR-V-headers-for-SPV_INTEL_split_barrier.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0002-Add-support-for-split-barriers-extension-SPV_INTEL_s.patch &&
-    ${PATCH_CMD} -p 1 -d ${IGC_SPIRV_TRANSLATOR_SOURCE_DIR} < ${IGC_OPENCL_CLANG_PATCH_DIR}/spirv/0003-Support-cl_bf16_conversions.patch
 )
 add_dependencies(
  external_igc_spirv_translator
--- a/build_files/build_environment/cmake/python.cmake
+++ b/build_files/build_environment/cmake/python.cmake
@@ -88,6 +88,19 @@ else()
    export LDFLAGS=${PYTHON_LDFLAGS} &&
    export PKG_CONFIG_PATH=${LIBDIR}/ffi/lib/pkgconfig)

+  # NOTE: untested on APPLE so far.
+  if(NOT APPLE)
+    set(PYTHON_CONFIGURE_EXTRA_ARGS
+      ${PYTHON_CONFIGURE_EXTRA_ARGS}
+      # Used on most release Linux builds (Fedora for e.g.),
+      # increases build times noticeably with the benefit of a modest speedup at runtime.
+      --enable-optimizations
+      # While LTO is OK when building on the same system, it's incompatible across GCC versions,
+      # making it impractical for developers to build against, so keep it disabled.
+      # `--with-lto`
+    )
+  endif()
+
  ExternalProject_Add(external_python
    URL file://${PACKAGE_DIR}/${PYTHON_FILE}
    DOWNLOAD_DIR ${DOWNLOAD_DIR}
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -668,9 +668,9 @@ set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
 # compiler, the versions used are taken from the following location
 # https://github.com/intel/intel-graphics-compiler/releases

-set(IGC_VERSION 1.0.12149.1)
+set(IGC_VERSION 1.0.13064.7)
 set(IGC_URI https://github.com/intel/intel-graphics-compiler/archive/refs/tags/igc-${IGC_VERSION}.tar.gz)
-set(IGC_HASH 44f67f24e3bc5130f9f062533abf8154782a9d0a992bc19b498639a8521ae836)
+set(IGC_HASH a929abd4cca2b293961ec0437ee4b3b2147bd3b2c8a3c423af78c0c359b2e5ae)
 set(IGC_HASH_TYPE SHA256)
 set(IGC_FILE igc-${IGC_VERSION}.tar.gz)

@@ -690,15 +690,15 @@ set(IGC_LLVM_FILE ${IGC_LLVM_VERSION}.tar.gz)
 #
 # WARNING WARNING WARNING

-set(IGC_OPENCL_CLANG_VERSION 363a5262d8c7cff3fb28f3bdb5d85c8d7e91c1bb)
+set(IGC_OPENCL_CLANG_VERSION ee31812ea8b89d08c2918f045d11a19bd33525c5)
 set(IGC_OPENCL_CLANG_URI https://github.com/intel/opencl-clang/archive/${IGC_OPENCL_CLANG_VERSION}.tar.gz)
-set(IGC_OPENCL_CLANG_HASH aa8cf72bb239722ce8ce44f79413c6887ecc8ca18477dd520aa5c4809756da9a)
+set(IGC_OPENCL_CLANG_HASH 1db6735bbcfaa31e8a9ba39f121d6bafa806ea8919e9f56782d6aaa67771ddda)
 set(IGC_OPENCL_CLANG_HASH_TYPE SHA256)
 set(IGC_OPENCL_CLANG_FILE opencl-clang-${IGC_OPENCL_CLANG_VERSION}.tar.gz)

-set(IGC_VCINTRINSICS_VERSION v0.5.0)
+set(IGC_VCINTRINSICS_VERSION v0.11.0)
 set(IGC_VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/refs/tags/${IGC_VCINTRINSICS_VERSION}.tar.gz)
-set(IGC_VCINTRINSICS_HASH 70bb47c5e32173cf61514941e83ae7c7eb4485e6d2fca60cfa1f50d4f42c41f2)
+set(IGC_VCINTRINSICS_HASH e5acd5626ce7fa6d41ce154c50ac805eda734ee66af94ef28e680ac2ad81bb9f)
 set(IGC_VCINTRINSICS_HASH_TYPE SHA256)
 set(IGC_VCINTRINSICS_FILE vc-intrinsics-${IGC_VCINTRINSICS_VERSION}.tar.gz)

@@ -714,9 +714,9 @@ set(IGC_SPIRV_TOOLS_HASH 6e19900e948944243024aedd0a201baf3854b377b9cc7a386553bc1
 set(IGC_SPIRV_TOOLS_HASH_TYPE SHA256)
 set(IGC_SPIRV_TOOLS_FILE SPIR-V-Tools-${IGC_SPIRV_TOOLS_VERSION}.tar.gz)

-set(IGC_SPIRV_TRANSLATOR_VERSION a31ffaeef77e23d500b3ea3d35e0c42ff5648ad9)
+set(IGC_SPIRV_TRANSLATOR_VERSION d739c01d65ec00dee64dedd40deed805216a7193)
 set(IGC_SPIRV_TRANSLATOR_URI https://github.com/KhronosGroup/SPIRV-LLVM-Translator/archive/${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)
-set(IGC_SPIRV_TRANSLATOR_HASH 9e26c96a45341b8f8af521bacea20e752623346340addd02af95d669f6e89252)
+set(IGC_SPIRV_TRANSLATOR_HASH ddc0cc9ccbe59dadeaf291012d59de142b2e9f2b124dbb634644d39daddaa13e)
 set(IGC_SPIRV_TRANSLATOR_HASH_TYPE SHA256)
 set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.tar.gz)

@@ -724,15 +724,15 @@ set(IGC_SPIRV_TRANSLATOR_FILE SPIR-V-Translator-${IGC_SPIRV_TRANSLATOR_VERSION}.
 ### Intel Graphics Compiler DEPS END ###
 ########################################

-set(GMMLIB_VERSION intel-gmmlib-22.1.8)
+set(GMMLIB_VERSION intel-gmmlib-22.3.0)
 set(GMMLIB_URI https://github.com/intel/gmmlib/archive/refs/tags/${GMMLIB_VERSION}.tar.gz)
-set(GMMLIB_HASH bf23e9a3742b4fb98c7666c9e9b29f3219e4b2fb4d831aaf4eed71f5e2d17368)
+set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9)
 set(GMMLIB_HASH_TYPE SHA256)
 set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)

-set(OCLOC_VERSION 22.38.24278)
+set(OCLOC_VERSION 22.49.25018.21)
 set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
-set(OCLOC_HASH db0c542fccd651e6404b15a74d46027f1ce0eda8dc9e25a40cbb6c0faef257ee)
+set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
 set(OCLOC_HASH_TYPE SHA256)
 set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

--- a/build_files/build_environment/patches/igc_opencl_clang.diff
+++ b/build_files/build_environment/patches/igc_opencl_clang.diff
@@ -1,7 +1,7 @@
 diff -Naur external_igc_opencl_clang.orig/CMakeLists.txt external_igc_opencl_clang/CMakeLists.txt
 --- external_igc_opencl_clang.orig/CMakeLists.txt	2022-03-16 05:51:10 -0600
 +++ external_igc_opencl_clang/CMakeLists.txt	2022-05-23 10:40:09 -0600
-@@ -126,22 +126,24 @@
+@@ -147,22 +147,24 @@
         )
     endif()
 
--- a/build_files/cmake/buildinfo.cmake
+++ b/build_files/cmake/buildinfo.cmake
@@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)

  if(MY_WC_BRANCH STREQUAL "HEAD")
    # Detached HEAD, check whether commit hash is reachable
-    # in the master branch
+    # in the main branch
    execute_process(COMMAND git rev-parse --short=12 HEAD
                    WORKING_DIRECTORY ${SOURCE_DIR}
                    OUTPUT_VARIABLE MY_WC_HASH
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

-    execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
+    execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
                    WORKING_DIRECTORY ${SOURCE_DIR}
                    OUTPUT_VARIABLE _git_contains_check
                    OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(NOT _git_contains_check STREQUAL "")
-      set(MY_WC_BRANCH "master")
+      set(MY_WC_BRANCH "main")
    else()
      execute_process(COMMAND git show-ref --tags -d
                      WORKING_DIRECTORY ${SOURCE_DIR}
@@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
                      OUTPUT_STRIP_TRAILING_WHITESPACE)

      if(_git_tag_hashes MATCHES "${_git_head_hash}")
-        set(MY_WC_BRANCH "master")
+        set(MY_WC_BRANCH "main")
      else()
        execute_process(COMMAND git branch --contains ${MY_WC_HASH}
                        WORKING_DIRECTORY ${SOURCE_DIR}
--- a/build_files/cmake/example_scripts/cmake_linux_install.sh
+++ b/build_files/cmake/example_scripts/cmake_linux_install.sh
@@ -11,11 +11,11 @@
 mkdir ~/blender-git
 cd ~/blender-git

-git clone http://git.blender.org/blender.git
+git clone https://projects.blender.org/blender/blender.git
 cd blender
 git submodule update --init --recursive
-git submodule foreach git checkout master
-git submodule foreach git pull --rebase origin master
+git submodule foreach git checkout main
+git submodule foreach git pull --rebase origin main

 # create build dir
 mkdir ~/blender-git/build-cmake
@@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
 echo ""
 echo "* Useful Commands *"
 echo "   Run Blender: ~/blender-git/blender/blender.bin"
-echo "   Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
+echo "   Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
 echo "   Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
 echo "   Build Blender: cd ~/blender-git/build-cmake ; make"
 echo ""
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -5,16 +5,16 @@
 update-code:
    git:
        submodules:
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: release/scripts/addons
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: release/scripts/addons_contrib
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: release/datafiles/locale
-        -   branch: master
+        -   branch: main
            commit_id: HEAD
            path: source/tools
    svn:
--- a/build_files/utils/make_bpy_wheel.py
+++ b/build_files/utils/make_bpy_wheel.py
@@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
 ## Source Code

 * [Releases](https://download.blender.org/source/)
-* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
+* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)

 ## Credits

--- a/build_files/utils/make_update.py
+++ b/build_files/utils/make_update.py
@@ -170,7 +170,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
        return "rebase or merge in progress, complete it first"

    # Abort if uncommitted changes.
-    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
+    changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
    if len(changes) != 0:
        return "you have unstaged changes"

@@ -202,8 +202,8 @@ def submodules_update(
        sys.exit(1)

    # Update submodules to appropriate given branch,
-    # falling back to master if none is given and/or found in a sub-repository.
-    branch_fallback = "master"
+    # falling back to main if none is given and/or found in a sub-repository.
+    branch_fallback = "main"
    if not branch:
        branch = branch_fallback

--- a/build_files/windows/check_submodules.cmd
+++ b/build_files/windows/check_submodules.cmd
@@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
 	if not "%GIT%" == "" (
 		"%GIT%" submodule update --init --recursive --progress
 		if errorlevel 1 goto FAIL
-		"%GIT%" submodule foreach git checkout master
+		"%GIT%" submodule foreach git checkout main
 		if errorlevel 1 goto FAIL
-		"%GIT%" submodule foreach git pull --rebase origin master
+		"%GIT%" submodule foreach git pull --rebase origin main
 		if errorlevel 1 goto FAIL
 		goto EOF
 	) else (
--- a/doc/python_api/examples/blf.py
+++ b/doc/python_api/examples/blf.py
@@ -37,7 +37,7 @@ def draw_callback_px(self, context):
    # BLF drawing routine
    font_id = font_info["font_id"]
    blf.position(font_id, 2, 80, 0)
-    blf.size(font_id, 50, 72)
+    blf.size(font_id, 50)
    blf.draw(font_id, "Hello World")


--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):

    # operators
    def write_ops():
-        API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
-        API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
-        API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
+        API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/release/scripts"
+        API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
+        API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"

        op_modules = {}
        op = None
--- a/doc/python_api/static/js/version_switch.js
+++ b/doc/python_api/static/js/version_switch.js
@@ -156,7 +156,7 @@ var Popover = function() {
    },
    getNamed : function(v) {
      $.each(all_versions, function(ix, title) {
-        if (ix === "master" || ix === "latest") {
+        if (ix === "master" || ix === "main" || ix === "latest") {
          var m = title.match(/\d\.\d[\w\d\.]*/)[0];
          if (parseFloat(m) == v) {
            v = ix;
--- a/extern/hipew/README.blender
+++ b/extern/hipew/README.blender
@@ -1,5 +1,5 @@
 Project: Blender
-URL: https://git.blender.org/blender.git
+URL: https://projects.blender.org/blender/blender.git
 License: Apache 2.0
 Upstream version: N/A
 Local modifications: None
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -12,6 +12,7 @@ from bpy.props import (
    PointerProperty,
    StringProperty,
 )
+from bpy.app.translations import pgettext_iface as iface_

 from math import pi

@@ -1664,30 +1665,48 @@ class CyclesPreferences(bpy.types.AddonPreferences):
            col.label(text="No compatible GPUs found for Cycles", icon='INFO')

            if device_type == 'CUDA':
-                col.label(text="Requires NVIDIA GPU with compute capability 3.0", icon='BLANK1')
+                compute_capability = "3.0"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
            elif device_type == 'OPTIX':
-                col.label(text="Requires NVIDIA GPU with compute capability 5.0", icon='BLANK1')
-                col.label(text="and NVIDIA driver version 470 or newer", icon='BLANK1')
+                compute_capability = "5.0"
+                driver_version = "470"
+                col.label(text=iface_("Requires NVIDIA GPU with compute capability %s") % compute_capability,
+                          icon='BLANK1', translate=False)
+                col.label(text="and NVIDIA driver version %s or newer" % driver_version,
+                          icon='BLANK1', translate=False)
            elif device_type == 'HIP':
                import sys
                if sys.platform[:3] == "win":
+                    driver_version = "21.Q4"
                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD Radeon Pro 21.Q4 driver or newer", icon='BLANK1')
+                    col.label(text=iface_("and AMD Radeon Pro %s driver or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                elif sys.platform.startswith("linux"):
+                    driver_version = "22.10"
                    col.label(text="Requires AMD GPU with RDNA architecture", icon='BLANK1')
-                    col.label(text="and AMD driver version 22.10 or newer", icon='BLANK1')
+                    col.label(text=iface_("and AMD driver version %s or newer") % driver_version, icon='BLANK1',
+                              translate=False)
            elif device_type == 'ONEAPI':
                import sys
                if sys.platform.startswith("win"):
+                    driver_version = "101.4032"
                    col.label(text="Requires Intel GPU with Xe-HPG architecture", icon='BLANK1')
-                    col.label(text="and Windows driver version 101.4032 or newer", icon='BLANK1')
+                    col.label(text=iface_("and Windows driver version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                elif sys.platform.startswith("linux"):
+                    driver_version = "1.3.24931"
                    col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
-                    col.label(text="  - intel-level-zero-gpu version 1.3.24931 or newer", icon='BLANK1')
+                    col.label(text=iface_("  - intel-level-zero-gpu version %s or newer") % driver_version,
+                              icon='BLANK1', translate=False)
                    col.label(text="  - oneAPI Level-Zero Loader", icon='BLANK1')
            elif device_type == 'METAL':
-                col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
-                col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
+                silicon_mac_version = "12.2"
+                amd_mac_version = "12.3"
+                col.label(text=iface_("Requires Apple Silicon with macOS %s or newer") % silicon_mac_version,
+                          icon='BLANK1', translate=False)
+                col.label(text=iface_("or AMD with macOS %s or newer") % amd_mac_version, icon='BLANK1',
+                          translate=False)
            return

        for device in devices:
@@ -1723,12 +1742,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):

        if compute_device_type == 'METAL':
            import platform
-            # MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
-            # Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
-            if platform.machine() == 'arm64':
+            import re
+            is_navi_2 = False
+            for device in devices:
+                if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
+                    is_navi_2 = True
+                    break
+
+            # MetalRT only works on Apple Silicon and Navi2.
+            is_arm64 = platform.machine() == 'arm64'
+            if is_arm64 or is_navi_2:
                col = layout.column()
                col.use_property_split = True
-                col.prop(self, "kernel_optimization_level")
+                # Kernel specialization is only supported on Apple Silicon
+                if is_arm64:
+                    col.prop(self, "kernel_optimization_level")
                col.prop(self, "use_metalrt")

    def draw(self, context):
--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
 }

 CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
+  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
+
  first_error = true;

  cuDevId = info.num;
@@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

  need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
  pitch_alignment = 0;

  /* Initialize CUDA. */
@@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
-  cuda_assert(
-      cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  int value;
+  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
 #  endif
 }

-void CUDADevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void CUDADevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
-      device_memory &mem = *pair.first;
-      CUDAMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple CUDA devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
 {
  CUDAContextScope scope(this);

-  CUdeviceptr device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
  cuMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    cuMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = cuMemAlloc(&device_pointer, size);
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = CUDA_SUCCESS;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = cuMemHostAlloc(
-          &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
-
-      assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
-             (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == CUDA_SUCCESS) {
-      cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != CUDA_SUCCESS) {
-    if (mem.type == MEM_DEVICE_ONLY) {
-      status = " failed, out of device memory";
-      set_error("System is out of GPU memory");
-    }
-    else {
-      status = " failed, out of device and host memory";
-      set_error("System is out of GPU and shared host memory");
-    }
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  CUDAMem *cmem = &cuda_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * CUDA memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }

-void CUDADevice::generic_copy_to(device_memory &mem)
+bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  CUDAContextScope scope(this);

-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const CUDAContextScope scope(this);
-    cuda_assert(
-        cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
+  return mem_alloc_result == CUDA_SUCCESS;
 }

-void CUDADevice::generic_free(device_memory &mem)
+void CUDADevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+  CUDAContextScope scope(this);

-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
+}

-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          cuMemFreeHost(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      cuda_assert(cuMemFree(mem.device_pointer));
-    }
+bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  CUDAContextScope scope(this);

-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  CUresult mem_alloc_result = cuMemHostAlloc(
+      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+  return mem_alloc_result == CUDA_SUCCESS;
+}

-    cuda_mem_map.erase(cuda_mem_map.find(&mem));
-  }
+void CUDADevice::free_host(void *shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuMemFreeHost(shared_pointer);
+}
+
+bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  CUDAContextScope scope(this);
+
+  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
+  return true;
+}
+
+void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const CUDAContextScope scope(this);
+
+  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
 }

 void CUDADevice::mem_alloc(device_memory &mem)
@@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(cuda_mem_map_mutex);
-  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const CUDAContextScope scope(this);
    cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
  }
@@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
      return;
  }

-  CUDAMem *cmem = NULL;
+  Mem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
    mem.device_size = size;
    stats.mem_alloc(size);

-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
@@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    cmem = &cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
 {
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
-    thread_scoped_lock lock(cuda_mem_map_mutex);
-    DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
-    const CUDAMem &cmem = cuda_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
@@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
-      cuArrayDestroy(cmem.array);
+      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

-      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class CUDADevice : public Device {
+class CUDADevice : public GPUDevice {

  friend class CUDAContextScope;

@@ -29,36 +29,11 @@ class CUDADevice : public Device {
  CUdevice cuDevice;
  CUcontext cuContext;
  CUmodule cuModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
  int pitch_alignment;
  int cuDevId;
  int cuDevArchitecture;
  bool first_error;

-  struct CUDAMem {
-    CUDAMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    CUtexObject texobject;
-    CUarray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, CUDAMem> CUDAMemMap;
-  CUDAMemMap cuda_mem_map;
-  thread_mutex cuda_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
  CUDADeviceKernels kernels;

  static bool have_precompiled_kernels();
@@ -88,17 +63,13 @@ class CUDADevice : public Device {

  void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

  void mem_alloc(device_memory &mem) override;

--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
  return nullptr;
 }

+GPUDevice::~GPUDevice() noexcept(false)
+{
+}
+
+bool GPUDevice::load_texture_info()
+{
+  if (need_texture_info) {
+    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
+     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
+    need_texture_info = false;
+    texture_info.copy_to_device();
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+
+void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
+                                 size_t preferred_working_headroom)
+{
+  /* Limit amount of host mapped memory, because allocating too much can
+   * cause system instability. Leave at least half or 4 GB of system
+   * memory free, whichever is smaller. */
+  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+  size_t system_ram = system_physical_ram();
+
+  if (system_ram > 0) {
+    if (system_ram / 2 > default_limit) {
+      map_host_limit = system_ram - default_limit;
+    }
+    else {
+      map_host_limit = system_ram / 2;
+    }
+  }
+  else {
+    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
+    map_host_limit = 0;
+  }
+
+  /* Amount of device memory to keep free after texture memory
+   * and working memory allocations respectively. We set the working
+   * memory limit headroom lower than the working one so there
+   * is space left for it. */
+  device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
+                                                             32 * 1024 * 1024LL;  // 32MB
+  device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
+                                                             128 * 1024 * 1024LL;  // 128MB
+
+  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+}
+
+void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
+{
+  /* Break out of recursive call, which can happen when moving memory on a multi device. */
+  static bool any_device_moving_textures_to_host = false;
+  if (any_device_moving_textures_to_host) {
+    return;
+  }
+
+  /* Signal to reallocate textures in host memory only. */
+  move_texture_to_host = true;
+
+  while (size > 0) {
+    /* Find suitable memory allocation to move. */
+    device_memory *max_mem = NULL;
+    size_t max_size = 0;
+    bool max_is_image = false;
+
+    thread_scoped_lock lock(device_mem_map_mutex);
+    foreach (MemMap::value_type &pair, device_mem_map) {
+      device_memory &mem = *pair.first;
+      Mem *cmem = &pair.second;
+
+      /* Can only move textures allocated on this device (and not those from peer devices).
+       * And need to ignore memory that is already on the host. */
+      if (!mem.is_resident(this) || cmem->use_mapped_host) {
+        continue;
+      }
+
+      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
+                        (&mem != &texture_info);
+      bool is_image = is_texture && (mem.data_height > 1);
+
+      /* Can't move this type of memory. */
+      if (!is_texture || cmem->array) {
+        continue;
+      }
+
+      /* For other textures, only move image textures. */
+      if (for_texture && !is_image) {
+        continue;
+      }
+
+      /* Try to move largest allocation, prefer moving images. */
+      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+        max_is_image = is_image;
+        max_size = mem.device_size;
+        max_mem = &mem;
+      }
+    }
+    lock.unlock();
+
+    /* Move to host memory. This part is mutex protected since
+     * multiple backend devices could be moving the memory. The
+     * first one will do it, and the rest will adopt the pointer. */
+    if (max_mem) {
+      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
+
+      static thread_mutex move_mutex;
+      thread_scoped_lock lock(move_mutex);
+
+      any_device_moving_textures_to_host = true;
+
+      /* Potentially need to call back into multi device, so pointer mapping
+       * and peer devices are updated. This is also necessary since the device
+       * pointer may just be a key here, so cannot be accessed and freed directly.
+       * Unfortunately it does mean that memory is reallocated on all other
+       * devices as well, which is potentially dangerous when still in use (since
+       * a thread rendering on another devices would only be caught in this mutex
+       * if it so happens to do an allocation at the same time as well. */
+      max_mem->device_copy_to();
+      size = (max_size >= size) ? 0 : size - max_size;
+
+      any_device_moving_textures_to_host = false;
+    }
+    else {
+      break;
+    }
+  }
+
+  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
+  move_texture_to_host = false;
+
+  /* Update texture info array with new pointers. */
+  load_texture_info();
+}
+
+GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+{
+  void *device_pointer = 0;
+  size_t size = mem.memory_size() + pitch_padding;
+
+  bool mem_alloc_result = false;
+  const char *status = "";
+
+  /* First try allocating in device memory, respecting headroom. We make
+   * an exception for texture info. It is small and frequently accessed,
+   * so treat it as working memory.
+   *
+   * If there is not enough room for working memory, we will try to move
+   * textures to host memory, assuming the performance impact would have
+   * been worse for working memory. */
+  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
+  bool is_image = is_texture && (mem.data_height > 1);
+
+  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+  size_t total = 0, free = 0;
+  get_device_memory_info(total, free);
+
+  /* Move textures to host memory if needed. */
+  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
+    move_textures_to_host(size + headroom - free, is_texture);
+    get_device_memory_info(total, free);
+  }
+
+  /* Allocate in device memory. */
+  if (!move_texture_to_host && (size + headroom) < free) {
+    mem_alloc_result = alloc_device(device_pointer, size);
+    if (mem_alloc_result) {
+      device_mem_in_use += size;
+      status = " in device memory";
+    }
+  }
+
+  /* Fall back to mapped host memory if needed and possible. */
+
+  void *shared_pointer = 0;
+
+  if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
+    if (mem.shared_pointer) {
+      /* Another device already allocated host memory. */
+      mem_alloc_result = true;
+      shared_pointer = mem.shared_pointer;
+    }
+    else if (map_host_used + size < map_host_limit) {
+      /* Allocate host memory ourselves. */
+      mem_alloc_result = alloc_host(shared_pointer, size);
+
+      assert((mem_alloc_result && shared_pointer != 0) ||
+             (!mem_alloc_result && shared_pointer == 0));
+    }
+
+    if (mem_alloc_result) {
+      assert(transform_host_pointer(device_pointer, shared_pointer));
+      map_host_used += size;
+      status = " in host memory";
+    }
+  }
+
+  if (!mem_alloc_result) {
+    if (mem.type == MEM_DEVICE_ONLY) {
+      status = " failed, out of device memory";
+      set_error("System is out of GPU memory");
+    }
+    else {
+      status = " failed, out of device and host memory";
+      set_error("System is out of GPU and shared host memory");
+    }
+  }
+
+  if (mem.name) {
+    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")" << status;
+  }
+
+  mem.device_pointer = (device_ptr)device_pointer;
+  mem.device_size = size;
+  stats.mem_alloc(size);
+
+  if (!mem.device_pointer) {
+    return NULL;
+  }
+
+  /* Insert into map of allocations. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  Mem *cmem = &device_mem_map[&mem];
+  if (shared_pointer != 0) {
+    /* Replace host pointer with our host allocation. Only works if
+     * memory layout is the same and has no pitch padding. Also
+     * does not work if we move textures to host during a render,
+     * since other devices might be using the memory. */
+
+    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+        mem.host_pointer != shared_pointer) {
+      memcpy(shared_pointer, mem.host_pointer, size);
+
+      /* A Call to device_memory::host_free() should be preceded by
+       * a call to device_memory::device_free() for host memory
+       * allocated by a device to be handled properly. Two exceptions
+       * are here and a call in OptiXDevice::generic_alloc(), where
+       * the current host memory can be assumed to be allocated by
+       * device_memory::host_alloc(), not by a device */
+
+      mem.host_free();
+      mem.host_pointer = shared_pointer;
+    }
+    mem.shared_pointer = shared_pointer;
+    mem.shared_counter++;
+    cmem->use_mapped_host = true;
+  }
+  else {
+    cmem->use_mapped_host = false;
+  }
+
+  return cmem;
+}
+
+void GPUDevice::generic_free(device_memory &mem)
+{
+  if (mem.device_pointer) {
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];
+
+    /* If cmem.use_mapped_host is true, reference counting is used
+     * to safely free a mapped host memory. */
+
+    if (cmem.use_mapped_host) {
+      assert(mem.shared_pointer);
+      if (mem.shared_pointer) {
+        assert(mem.shared_counter > 0);
+        if (--mem.shared_counter == 0) {
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          free_host(mem.shared_pointer);
+          mem.shared_pointer = 0;
+        }
+      }
+      map_host_used -= mem.device_size;
+    }
+    else {
+      /* Free device memory. */
+      free_device((void *)mem.device_pointer);
+      device_mem_in_use -= mem.device_size;
+    }
+
+    stats.mem_free(mem.device_size);
+    mem.device_pointer = 0;
+    mem.device_size = 0;
+
+    device_mem_map.erase(device_mem_map.find(&mem));
+  }
+}
+
+void GPUDevice::generic_copy_to(device_memory &mem)
+{
+  if (!mem.host_pointer || !mem.device_pointer) {
+    return;
+  }
+
+  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
+   * backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
+   * copy data from mem.host_pointer. */
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+    copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
+  }
+}
+
 /* DeviceInfo */

 CCL_NAMESPACE_END
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -309,6 +309,93 @@ class Device {
  static uint devices_initialized_mask;
 };

+/* Device, which is GPU, with some common functionality for GPU backends */
+class GPUDevice : public Device {
+ protected:
+  GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
+      : Device(info_, stats_, profiler_),
+        texture_info(this, "texture_info", MEM_GLOBAL),
+        need_texture_info(false),
+        can_map_host(false),
+        map_host_used(0),
+        map_host_limit(0),
+        device_texture_headroom(0),
+        device_working_headroom(0),
+        device_mem_map(),
+        device_mem_map_mutex(),
+        move_texture_to_host(false),
+        device_mem_in_use(0)
+  {
+  }
+
+ public:
+  virtual ~GPUDevice() noexcept(false);
+
+  /* For GPUs that can use bindless textures in some way or another. */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+  /* Returns true if the texture info was copied to the device (meaning, some more
+   * re-initialization might be needed). */
+  virtual bool load_texture_info();
+
+ protected:
+  /* Memory allocation, only accessed through device_memory. */
+  friend class device_memory;
+
+  bool can_map_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  typedef unsigned long long texMemObject;
+  typedef unsigned long long arrayMemObject;
+  struct Mem {
+    Mem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    texMemObject texobject;
+    arrayMemObject array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, Mem> MemMap;
+  MemMap device_mem_map;
+  thread_mutex device_mem_map_mutex;
+  bool move_texture_to_host;
+  /* Simple counter which will try to track amount of used device memory */
+  size_t device_mem_in_use;
+
+  virtual void init_host_memory(size_t preferred_texture_headroom = 0,
+                                size_t preferred_working_headroom = 0);
+  virtual void move_textures_to_host(size_t size, bool for_texture);
+
+  /* Allocation, deallocation and copy functions, with corresponding
+   * support of device/host allocations. */
+  virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+  virtual void generic_free(device_memory &mem);
+  virtual void generic_copy_to(device_memory &mem);
+
+  /* total - amount of device memory, free - amount of available device memory */
+  virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
+
+  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
+
+  virtual void free_device(void *device_pointer) = 0;
+
+  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
+
+  virtual void free_host(void *shared_pointer) = 0;
+
+  /* This function should return device pointer corresponding to shared pointer, which
+   * is host buffer, allocated in `alloc_host`. The function should `true`, if such
+   * address transformation is possible and `false` otherwise. */
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
+
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
+};
+
 CCL_NAMESPACE_END

 #endif /* __DEVICE_H__ */
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
 }

 HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
-    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
+    : GPUDevice(info, stats, profiler)
 {
+  /* Verify that base class types can be used with specific backend types */
+  static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
+  static_assert(sizeof(arrayMemObject) == sizeof(hArray));
+
  first_error = true;

  hipDevId = info.num;
@@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)

  need_texture_info = false;

-  device_texture_headroom = 0;
-  device_working_headroom = 0;
-  move_texture_to_host = false;
-  map_host_limit = 0;
-  map_host_used = 0;
-  can_map_host = 0;
  pitch_alignment = 0;

  /* Initialize HIP. */
@@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
  /* hipDeviceMapHost for mapping host memory when out of device memory.
   * hipDeviceLmemResizeToMax for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
-  hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  int value;
+  hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
+  can_map_host = value != 0;

  hip_assert(
      hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
 #  endif
 }

-void HIPDevice::init_host_memory()
-{
-  /* Limit amount of host mapped memory, because allocating too much can
-   * cause system instability. Leave at least half or 4 GB of system
-   * memory free, whichever is smaller. */
-  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-  size_t system_ram = system_physical_ram();
-
-  if (system_ram > 0) {
-    if (system_ram / 2 > default_limit) {
-      map_host_limit = system_ram - default_limit;
-    }
-    else {
-      map_host_limit = system_ram / 2;
-    }
-  }
-  else {
-    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
-    map_host_limit = 0;
-  }
-
-  /* Amount of device memory to keep is free after texture memory
-   * and working memory allocations respectively. We set the working
-   * memory limit headroom lower so that some space is left after all
-   * texture memory allocations. */
-  device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
-  device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
-
-  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
-            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
-}
-
-void HIPDevice::load_texture_info()
-{
-  if (need_texture_info) {
-    /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
-     * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
-    need_texture_info = false;
-    texture_info.copy_to_device();
-  }
-}
-
-void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
-{
-  /* Break out of recursive call, which can happen when moving memory on a multi device. */
-  static bool any_device_moving_textures_to_host = false;
-  if (any_device_moving_textures_to_host) {
-    return;
-  }
-
-  /* Signal to reallocate textures in host memory only. */
-  move_texture_to_host = true;
-
-  while (size > 0) {
-    /* Find suitable memory allocation to move. */
-    device_memory *max_mem = NULL;
-    size_t max_size = 0;
-    bool max_is_image = false;
-
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    foreach (HIPMemMap::value_type &pair, hip_mem_map) {
-      device_memory &mem = *pair.first;
-      HIPMem *cmem = &pair.second;
-
-      /* Can only move textures allocated on this device (and not those from peer devices).
-       * And need to ignore memory that is already on the host. */
-      if (!mem.is_resident(this) || cmem->use_mapped_host) {
-        continue;
-      }
-
-      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
-                        (&mem != &texture_info);
-      bool is_image = is_texture && (mem.data_height > 1);
-
-      /* Can't move this type of memory. */
-      if (!is_texture || cmem->array) {
-        continue;
-      }
-
-      /* For other textures, only move image textures. */
-      if (for_texture && !is_image) {
-        continue;
-      }
-
-      /* Try to move largest allocation, prefer moving images. */
-      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
-        max_is_image = is_image;
-        max_size = mem.device_size;
-        max_mem = &mem;
-      }
-    }
-    lock.unlock();
-
-    /* Move to host memory. This part is mutex protected since
-     * multiple HIP devices could be moving the memory. The
-     * first one will do it, and the rest will adopt the pointer. */
-    if (max_mem) {
-      VLOG_WORK << "Move memory from device to host: " << max_mem->name;
-
-      static thread_mutex move_mutex;
-      thread_scoped_lock lock(move_mutex);
-
-      any_device_moving_textures_to_host = true;
-
-      /* Potentially need to call back into multi device, so pointer mapping
-       * and peer devices are updated. This is also necessary since the device
-       * pointer may just be a key here, so cannot be accessed and freed directly.
-       * Unfortunately it does mean that memory is reallocated on all other
-       * devices as well, which is potentially dangerous when still in use (since
-       * a thread rendering on another devices would only be caught in this mutex
-       * if it so happens to do an allocation at the same time as well. */
-      max_mem->device_copy_to();
-      size = (max_size >= size) ? 0 : size - max_size;
-
-      any_device_moving_textures_to_host = false;
-    }
-    else {
-      break;
-    }
-  }
-
-  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
-  move_texture_to_host = false;
-
-  /* Update texture info array with new pointers. */
-  load_texture_info();
-}
-
-HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
+void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
 {
  HIPContextScope scope(this);

-  hipDeviceptr_t device_pointer = 0;
-  size_t size = mem.memory_size() + pitch_padding;
-
-  hipError_t mem_alloc_result = hipErrorOutOfMemory;
-  const char *status = "";
-
-  /* First try allocating in device memory, respecting headroom. We make
-   * an exception for texture info. It is small and frequently accessed,
-   * so treat it as working memory.
-   *
-   * If there is not enough room for working memory, we will try to move
-   * textures to host memory, assuming the performance impact would have
-   * been worse for working memory. */
-  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
-  bool is_image = is_texture && (mem.data_height > 1);
-
-  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
-
-  size_t total = 0, free = 0;
  hipMemGetInfo(&free, &total);
-
-  /* Move textures to host memory if needed. */
-  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
-    move_textures_to_host(size + headroom - free, is_texture);
-    hipMemGetInfo(&free, &total);
-  }
-
-  /* Allocate in device memory. */
-  if (!move_texture_to_host && (size + headroom) < free) {
-    mem_alloc_result = hipMalloc(&device_pointer, size);
-    if (mem_alloc_result == hipSuccess) {
-      status = " in device memory";
-    }
-  }
-
-  /* Fall back to mapped host memory if needed and possible. */
-
-  void *shared_pointer = 0;
-
-  if (mem_alloc_result != hipSuccess && can_map_host) {
-    if (mem.shared_pointer) {
-      /* Another device already allocated host memory. */
-      mem_alloc_result = hipSuccess;
-      shared_pointer = mem.shared_pointer;
-    }
-    else if (map_host_used + size < map_host_limit) {
-      /* Allocate host memory ourselves. */
-      mem_alloc_result = hipHostMalloc(
-          &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
-
-      assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
-             (mem_alloc_result != hipSuccess && shared_pointer == 0));
-    }
-
-    if (mem_alloc_result == hipSuccess) {
-      hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
-      map_host_used += size;
-      status = " in host memory";
-    }
-  }
-
-  if (mem_alloc_result != hipSuccess) {
-    status = " failed, out of device and host memory";
-    set_error("System is out of GPU and shared host memory");
-  }
-
-  if (mem.name) {
-    VLOG_WORK << "Buffer allocate: " << mem.name << ", "
-              << string_human_readable_number(mem.memory_size()) << " bytes. ("
-              << string_human_readable_size(mem.memory_size()) << ")" << status;
-  }
-
-  mem.device_pointer = (device_ptr)device_pointer;
-  mem.device_size = size;
-  stats.mem_alloc(size);
-
-  if (!mem.device_pointer) {
-    return NULL;
-  }
-
-  /* Insert into map of allocations. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  HIPMem *cmem = &hip_mem_map[&mem];
-  if (shared_pointer != 0) {
-    /* Replace host pointer with our host allocation. Only works if
-     * HIP memory layout is the same and has no pitch padding. Also
-     * does not work if we move textures to host during a render,
-     * since other devices might be using the memory. */
-
-    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
-        mem.host_pointer != shared_pointer) {
-      memcpy(shared_pointer, mem.host_pointer, size);
-
-      /* A Call to device_memory::host_free() should be preceded by
-       * a call to device_memory::device_free() for host memory
-       * allocated by a device to be handled properly. Two exceptions
-       * are here and a call in OptiXDevice::generic_alloc(), where
-       * the current host memory can be assumed to be allocated by
-       * device_memory::host_alloc(), not by a device */
-
-      mem.host_free();
-      mem.host_pointer = shared_pointer;
-    }
-    mem.shared_pointer = shared_pointer;
-    mem.shared_counter++;
-    cmem->use_mapped_host = true;
-  }
-  else {
-    cmem->use_mapped_host = false;
-  }
-
-  return cmem;
 }

-void HIPDevice::generic_copy_to(device_memory &mem)
+bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
 {
-  if (!mem.host_pointer || !mem.device_pointer) {
-    return;
-  }
+  HIPContextScope scope(this);

-  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
-   * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
-   * mem.host_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
-    const HIPContextScope scope(this);
-    hip_assert(
-        hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
-  }
+  hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
+  return mem_alloc_result == hipSuccess;
 }

-void HIPDevice::generic_free(device_memory &mem)
+void HIPDevice::free_device(void *device_pointer)
 {
-  if (mem.device_pointer) {
-    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+  HIPContextScope scope(this);

-    /* If cmem.use_mapped_host is true, reference counting is used
-     * to safely free a mapped host memory. */
+  hip_assert(hipFree((hipDeviceptr_t)device_pointer));
+}

-    if (cmem.use_mapped_host) {
-      assert(mem.shared_pointer);
-      if (mem.shared_pointer) {
-        assert(mem.shared_counter > 0);
-        if (--mem.shared_counter == 0) {
-          if (mem.host_pointer == mem.shared_pointer) {
-            mem.host_pointer = 0;
-          }
-          hipHostFree(mem.shared_pointer);
-          mem.shared_pointer = 0;
-        }
-      }
-      map_host_used -= mem.device_size;
-    }
-    else {
-      /* Free device memory. */
-      hip_assert(hipFree(mem.device_pointer));
-    }
+bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
+{
+  HIPContextScope scope(this);

-    stats.mem_free(mem.device_size);
-    mem.device_pointer = 0;
-    mem.device_size = 0;
+  hipError_t mem_alloc_result = hipHostMalloc(
+      &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);

-    hip_mem_map.erase(hip_mem_map.find(&mem));
-  }
+  return mem_alloc_result == hipSuccess;
+}
+
+void HIPDevice::free_host(void *shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hipHostFree(shared_pointer);
+}
+
+bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
+{
+  HIPContextScope scope(this);
+
+  hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
+  return true;
+}
+
+void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
+{
+  const HIPContextScope scope(this);
+
+  hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
 }

 void HIPDevice::mem_alloc(device_memory &mem)
@@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
-  thread_scoped_lock lock(hip_mem_map_mutex);
-  if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
+  thread_scoped_lock lock(device_mem_map_mutex);
+  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const HIPContextScope scope(this);
    hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
  }
@@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
      return;
  }

-  HIPMem *cmem = NULL;
+  Mem *cmem = NULL;
  hArray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (hArray)mem.device_pointer;
-      cmem->array = array_3d;
+      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
@@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
    mem.device_size = size;
    stats.mem_alloc(size);

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
-    cmem->array = array_3d;
+    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
@@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
    texDesc.filterMode = filter_mode;
    texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;

-    thread_scoped_lock lock(hip_mem_map_mutex);
-    cmem = &hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    cmem = &device_mem_map[&mem];

    hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

@@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
 {
  if (mem.device_pointer) {
    HIPContextScope scope(this);
-    thread_scoped_lock lock(hip_mem_map_mutex);
-    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
-    const HIPMem &cmem = hip_mem_map[&mem];
+    thread_scoped_lock lock(device_mem_map_mutex);
+    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
+    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
@@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
-      hipArrayDestroy(cmem.array);
+      hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

-      hip_mem_map.erase(hip_mem_map.find(&mem));
+      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN

 class DeviceQueue;

-class HIPDevice : public Device {
+class HIPDevice : public GPUDevice {

  friend class HIPContextScope;

@@ -26,36 +26,11 @@ class HIPDevice : public Device {
  hipDevice_t hipDevice;
  hipCtx_t hipContext;
  hipModule_t hipModule;
-  size_t device_texture_headroom;
-  size_t device_working_headroom;
-  bool move_texture_to_host;
-  size_t map_host_used;
-  size_t map_host_limit;
-  int can_map_host;
  int pitch_alignment;
  int hipDevId;
  int hipDevArchitecture;
  bool first_error;

-  struct HIPMem {
-    HIPMem() : texobject(0), array(0), use_mapped_host(false)
-    {
-    }
-
-    hipTextureObject_t texobject;
-    hArray array;
-
-    /* If true, a mapped host memory in shared_pointer is being used. */
-    bool use_mapped_host;
-  };
-  typedef map<device_memory *, HIPMem> HIPMemMap;
-  HIPMemMap hip_mem_map;
-  thread_mutex hip_mem_map_mutex;
-
-  /* Bindless Textures */
-  device_vector<TextureInfo> texture_info;
-  bool need_texture_info;
-
  HIPDeviceKernels kernels;

  static bool have_precompiled_kernels();
@@ -81,17 +56,13 @@ class HIPDevice : public Device {
  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);

-  void init_host_memory();
-
-  void load_texture_info();
-
-  void move_textures_to_host(size_t size, bool for_texture);
-
-  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
-
-  void generic_copy_to(device_memory &mem);
-
-  void generic_free(device_memory &mem);
+  virtual void get_device_memory_info(size_t &total, size_t &free) override;
+  virtual bool alloc_device(void *&device_pointer, size_t size) override;
+  virtual void free_device(void *device_pointer) override;
+  virtual bool alloc_host(void *&shared_pointer, size_t size) override;
+  virtual void free_host(void *shared_pointer) override;
+  virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
+  virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;

  void mem_alloc(device_memory &mem) override;

--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
      return "integrator_terminated_paths_array";
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
      return "integrator_sorted_paths_array";
+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
+      return "integrator_sort_bucket_pass";
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
+      return "integrator_sort_write_pass";
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      return "integrator_compact_paths_array";
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -247,6 +247,8 @@ class device_memory {
  bool is_resident(Device *sub_device) const;

 protected:
+  friend class Device;
+  friend class GPUDevice;
  friend class CUDADevice;
  friend class OptiXDevice;
  friend class HIPDevice;
--- a/intern/cycles/device/metal/bvh.h
+++ b/intern/cycles/device/metal/bvh.h
@@ -21,6 +21,7 @@ class BVHMetal : public BVH {

  API_AVAILABLE(macos(11.0))
  vector<id<MTLAccelerationStructure>> blas_array;
+  vector<uint32_t> blas_lookup;

  bool motion_blur = false;

--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,

    uint32_t instance_index = 0;
    uint32_t motion_transform_index = 0;
+
+    // allocate look up buffer for wost case scenario
+    uint64_t count = objects.size();
+    blas_lookup.resize(count);
+
    for (Object *ob : objects) {
      /* Skip non-traceable objects */
      if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
      /* Set user instance ID to object index */
      int object_index = ob->get_device_index();
      uint32_t user_id = uint32_t(object_index);
+      int currIndex = instance_index++;
+      assert(user_id < blas_lookup.size());
+      blas_lookup[user_id] = accel_struct_index;

      /* Bake into the appropriate descriptor */
      if (motion_blur) {
        MTLAccelerationStructureMotionInstanceDescriptor *instances =
            (MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
-        MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
+        MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];

        desc.accelerationStructureIndex = accel_struct_index;
        desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
      else {
        MTLAccelerationStructureUserIDInstanceDescriptor *instances =
            (MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
-        MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
+        MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];

        desc.accelerationStructureIndex = accel_struct_index;
        desc.userID = user_id;
--- a/intern/cycles/device/metal/device.mm
+++ b/intern/cycles/device/metal/device.mm
@@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
    info.denoisers = DENOISER_NONE;
    info.id = id;

+    if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
+      info.has_light_tree = false;
+    }
+
    devices.push_back(info);
    device_index++;
  }
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -74,6 +74,11 @@ class MetalDevice : public Device {
  id<MTLBuffer> texture_bindings_3d = nil;
  std::vector<id<MTLTexture>> texture_slot_map;

+  /* BLAS encoding & lookup */
+  id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+  id<MTLBuffer> blas_buffer = nil;
+  id<MTLBuffer> blas_lookup_buffer = nil;
+
  bool use_metalrt = false;
  MetalPipelineType kernel_specialization_level = PSO_GENERIC;

@@ -105,6 +110,8 @@ class MetalDevice : public Device {

  bool use_adaptive_compilation();

+  bool use_local_atomic_sort() const;
+
  bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);

  void make_source(MetalPipelineType pso_type, const uint kernel_features);
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
        arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
        arg_desc_as.access = MTLArgumentAccessReadOnly;

+        MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
+        arg_desc_ptrs.dataType = MTLDataTypePointer;
+        arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
+
        MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
        arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
        arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
        arg_desc_ift.index = index++;
        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
+        arg_desc_ift.index = index++;
+        [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
+        arg_desc_ptrs.index = index++;
+        [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
+        arg_desc_ptrs.index = index++;
+        [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */

        [arg_desc_ift release];
        [arg_desc_as release];
+        [arg_desc_ptrs release];
      }
    }

    mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];

+    // preparing the blas arg encoder
+    MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
+    arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
+    arg_desc_blas.access = MTLArgumentAccessReadOnly;
+    mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
+    [arg_desc_blas release];
+
    for (int i = 0; i < ancillary_desc.count; i++) {
      [ancillary_desc[i] release];
    }
@@ -271,6 +289,11 @@ bool MetalDevice::use_adaptive_compilation()
  return DebugFlags().metal.adaptive_compile;
 }

+bool MetalDevice::use_local_atomic_sort() const
+{
+  return DebugFlags().metal.use_local_atomic_sort;
+}
+
 void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
 {
  string global_defines;
@@ -278,6 +301,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
    global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
  }

+  if (use_local_atomic_sort()) {
+    global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
+  }
+
  if (use_metalrt) {
    global_defines += "#define __METALRT__\n";
    if (motion_blur) {
@@ -1231,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
    if (@available(macos 11.0, *)) {
      if (bvh->params.top_level) {
        bvhMetalRT = bvh_metal;
+
+        // allocate required buffers for BLAS array
+        uint64_t count = bvhMetalRT->blas_array.size();
+        uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
+        blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
+        stats.mem_alloc(blas_buffer.allocatedSize);
+
+        for (uint64_t i = 0; i < count; ++i) {
+          [mtlBlasArgEncoder setArgumentBuffer:blas_buffer
+                                        offset:i * mtlBlasArgEncoder.encodedLength];
+          [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
+        }
+
+        count = bvhMetalRT->blas_lookup.size();
+        bufferSize = sizeof(uint32_t) * count;
+        blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
+                                                    options:default_storage_mode];
+        stats.mem_alloc(blas_lookup_buffer.allocatedSize);
+
+        memcpy([blas_lookup_buffer contents],
+               bvhMetalRT -> blas_lookup.data(),
+               blas_lookup_buffer.allocatedSize);
+
+        if (default_storage_mode == MTLResourceStorageModeManaged) {
+          [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
+          [blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
+        }
      }
    }
  }
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -19,6 +19,8 @@ enum {
  METALRT_FUNC_SHADOW_BOX,
  METALRT_FUNC_LOCAL_TRI,
  METALRT_FUNC_LOCAL_BOX,
+  METALRT_FUNC_LOCAL_TRI_PRIM,
+  METALRT_FUNC_LOCAL_BOX_PRIM,
  METALRT_FUNC_CURVE_RIBBON,
  METALRT_FUNC_CURVE_RIBBON_SHADOW,
  METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
  METALRT_FUNC_NUM
 };

-enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+enum {
+  METALRT_TABLE_DEFAULT,
+  METALRT_TABLE_SHADOW,
+  METALRT_TABLE_LOCAL,
+  METALRT_TABLE_LOCAL_PRIM,
+  METALRT_TABLE_NUM
+};

 /* Pipeline State Object types */
 enum MetalPipelineType {
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -87,6 +87,9 @@ struct ShaderCache {
          break;
      }
    }
+
+    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
+    occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
  }
  ~ShaderCache();

@@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
          "__anyhit__cycles_metalrt_shadow_all_hit_box",
          "__anyhit__cycles_metalrt_local_hit_tri",
          "__anyhit__cycles_metalrt_local_hit_box",
+          "__anyhit__cycles_metalrt_local_hit_tri_prim",
+          "__anyhit__cycles_metalrt_local_hit_box_prim",
          "__intersection__curve_ribbon",
          "__intersection__curve_ribbon_shadow",
          "__intersection__curve_all",
@@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
                         nil];
+    table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
+        arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+                         rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+                         nil];

    NSMutableSet *unique_functions = [NSMutableSet
        setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+    [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];

    if (kernel_has_intersection(device_kernel)) {
      linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
  virtual int num_concurrent_states(const size_t) const override;
  virtual int num_concurrent_busy_states(const size_t) const override;
  virtual int num_sort_partition_elements() const override;
+  virtual bool supports_local_atomic_sort() const override;

  virtual void init_execution() override;

--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
  return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
 }

+bool MetalDeviceQueue::supports_local_atomic_sort() const
+{
+  return metal_device_->use_local_atomic_sort();
+}
+
 void MetalDeviceQueue::init_execution()
 {
  /* Synchronize all textures and memory copies before executing task. */
@@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      if (metal_device_->bvhMetalRT) {
        id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
        [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
+        [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
+                                                  offset:0
+                                                 atIndex:7];
+        [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
+                                                  offset:0
+                                                 atIndex:8];
      }

      for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      if (bvhMetalRT) {
        /* Mark all Accelerations resources as used */
        [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
+                                        usage:MTLResourceUsageRead];
+        [mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
+                                        usage:MTLResourceUsageRead];
        [mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
                                         count:bvhMetalRT->blas_array.size()
                                         usage:MTLResourceUsageRead];
@@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
      /* See parallel_active_index.h for why this amount of shared memory is needed.
       * Rounded up to 16 bytes for Metal */
      shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
-      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
      break;

+    case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
+    case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
+      int key_count = metal_device_->launch_params.data.max_shaders;
+      shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
+      break;
+    }
+
    default:
      break;
  }

+  if (shared_mem_bytes) {
+    assert(shared_mem_bytes <= 32 * 1024);
+    [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
+  }
+
  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
      divide_up(work_size, num_threads_per_block), 1, 1);
  MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
    return METAL_GPU_INTEL;
  }
  else if (strstr(device_name, "AMD")) {
+    /* Setting this env var hides AMD devices thus exposing any integrated Intel devices. */
+    if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
+      if (atoi(str)) {
+        return METAL_GPU_UNKNOWN;
+      }
+    }
    return METAL_GPU_AMD;
  }
  else if (strstr(device_name, "Apple")) {
@@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
    return usable_devices;
  }

+  /* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
+   * one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
+  bool has_usable_amd_gpu = false;
+  if (@available(macos 12.3, *)) {
+    for (id<MTLDevice> device in MTLCopyAllDevices()) {
+      has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
+    }
+  }
+
  metal_printf("Usable Metal devices:\n");
  for (id<MTLDevice> device in MTLCopyAllDevices()) {
    string device_name = get_device_name(device);
@@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
    }

 #  if defined(MAC_OS_VERSION_13_0)
-    if (@available(macos 13.0, *)) {
-      usable |= (vendor == METAL_GPU_INTEL);
+    if (!has_usable_amd_gpu) {
+      if (@available(macos 13.0, *)) {
+        usable |= (vendor == METAL_GPU_INTEL);
+      }
    }
 #  endif

--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
        context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
  }

+  OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
  vector<OptixStackSizes> osl_stack_size(osl_groups.size());

  /* Update SBT with new entries. */
  sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
+    optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
  }
  for (size_t i = 0; i < osl_groups.size(); ++i) {
    if (osl_groups[i] != NULL) {
@@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
                                     0,
                                     &pipelines[PIP_SHADE]));

+    const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
+                                      stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
    unsigned int dss = 0;
    for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
      dss = std::max(dss, osl_stack_size[i].dssDC);
    }

    optix_assert(optixPipelineSetStackSize(
-        pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
+        pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
  }

  return !have_error();
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -112,6 +112,13 @@ class DeviceQueue {
    return 65536;
  }

+  /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
+   * INTEGRATOR_SORT_WRITE_PASS)? */
+  virtual bool supports_local_atomic_sort() const
+  {
+    return false;
+  }
+
  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or path state.
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
          device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
      integrator_shader_sort_prefix_sum_(
          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
+      integrator_shader_sort_partition_key_offsets_(
+          device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
      integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
      integrator_next_shadow_path_index_(
          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
                                                                num_sort_partitions_);

-  /* Allocate arrays for shader sorting. */
-  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
-  if (integrator_shader_sort_counter_.size() < sort_buckets) {
-    integrator_shader_sort_counter_.alloc(sort_buckets);
-    integrator_shader_sort_counter_.zero_to_device();
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
-        (int *)integrator_shader_sort_counter_.device_pointer;
-
-    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
-    integrator_shader_sort_prefix_sum_.zero_to_device();
-  }
-
-  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
-    if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
-      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
-      integrator_shader_raytrace_sort_counter_.zero_to_device();
-      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
+    /* Allocate array for partitioned shader sorting using local atomics. */
+    const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
+    if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
+      integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
+      integrator_shader_sort_partition_key_offsets_.zero_to_device();
    }
+    integrator_state_gpu_.sort_partition_key_offsets =
+        (int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
  }
+  else {
+    /* Allocate arrays for shader sorting. */
+    const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+    if (integrator_shader_sort_counter_.size() < sort_buckets) {
+      integrator_shader_sort_counter_.alloc(sort_buckets);
+      integrator_shader_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+          (int *)integrator_shader_sort_counter_.device_pointer;

-  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
-    if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
-      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
-      integrator_shader_mnee_sort_counter_.zero_to_device();
-      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+      integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
+      integrator_shader_sort_prefix_sum_.zero_to_device();
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
+        integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+        integrator_shader_raytrace_sort_counter_.zero_to_device();
+        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+            (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+      }
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+      if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
+        integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+        integrator_shader_mnee_sort_counter_.zero_to_device();
+        integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+            (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+      }
    }
  }
 }
@@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
    work_size = num_queued;
    d_path_index = queued_paths_.device_pointer;

-    compute_sorted_queued_paths(
-        DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
+    compute_sorted_queued_paths(kernel, num_paths_limit);
  }
  else if (num_queued < work_size) {
    work_size = num_queued;
@@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
  }
 }

-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
-                                                   DeviceKernel queued_kernel,
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
                                                   const int num_paths_limit)
 {
  int d_queued_kernel = queued_kernel;
+
+  /* Launch kernel to fill the active paths arrays. */
+  if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
+    const int work_size = kernel_max_active_main_path_index(queued_kernel);
+    device_ptr d_queued_paths = queued_paths_.device_pointer;
+
+    int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
+
+    DeviceKernelArguments args(
+        &work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
+
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
+    return;
+  }
+
  device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
  device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
  assert(d_counter != 0 && d_prefix_sum != 0);
@@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
                               &d_prefix_sum,
                               &d_queued_kernel);

-    queue_->enqueue(kernel, work_size, args);
+    queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
  }
 }

--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
  void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);

  void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
-  void compute_sorted_queued_paths(DeviceKernel kernel,
-                                   DeviceKernel queued_kernel,
-                                   const int num_paths_limit);
+  void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);

  void compact_main_paths(const int num_active_paths);
  void compact_shadow_paths();
@@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
  device_vector<int> integrator_shader_raytrace_sort_counter_;
  device_vector<int> integrator_shader_mnee_sort_counter_;
  device_vector<int> integrator_shader_sort_prefix_sum_;
+  device_vector<int> integrator_shader_sort_partition_key_offsets_;
  /* Path split. */
  device_vector<int> integrator_next_main_path_index_;
  device_vector<int> integrator_next_shadow_path_index_;
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -661,7 +661,8 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
 #endif
 }

-ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd, ccl_private const ShaderClosure *sc)
+ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
+                                       ccl_private const ShaderClosure *sc)
 {
  Spectrum albedo = sc->weight;
  /* Some closures include additional components such as Fresnel terms that cause their albedo to
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -519,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
  return SD_BSDF | SD_BSDF_HAS_EVAL;
 }

-/* Required to maintain OSL interface. */
-ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
-{
-  bsdf->alpha_y = bsdf->alpha_x;
-
-  return bsdf_microfacet_ggx_setup(bsdf);
-}
-
 ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
                                                 ccl_private const ShaderData *sd)
 {
@@ -613,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
  return SD_BSDF | SD_BSDF_HAS_EVAL;
 }

-/* Required to maintain OSL interface. */
-ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
-{
-  bsdf->alpha_y = bsdf->alpha_x;
-
-  return bsdf_microfacet_beckmann_setup(bsdf);
-}
-
 ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
 {
  bsdf->alpha_x = saturatef(bsdf->alpha_x);
--- a/intern/cycles/kernel/closure/bsdf_util.h
+++ b/intern/cycles/kernel/closure/bsdf_util.h
@@ -90,8 +90,10 @@ ccl_device float schlick_fresnel(float u)
 }

 /* Calculate the fresnel color, which is a blend between white and the F0 color */
-ccl_device_forceinline Spectrum
-interpolate_fresnel_color(float3 L, float3 H, float ior, Spectrum F0)
+ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
+                                                          float3 H,
+                                                          float ior,
+                                                          Spectrum F0)
 {
  /* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
   * The reason why we use this remapping instead of directly doing the
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
 }
 ccl_gpu_kernel_postfix

+ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
+                             int num_states,
+                             int partition_size,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             int kernel_index)
+{
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  int max_shaders = context.launch_params_metal.data.max_shaders;
+  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
+                                           kernel_integrator_state.path.queued_kernel;
+  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
+                                           kernel_integrator_state.path.shader_sort_key;
+  ccl_global int *key_offsets = (ccl_global int *)
+                                    kernel_integrator_state.sort_partition_key_offsets;
+
+  gpu_parallel_sort_bucket_pass(num_states,
+                                partition_size,
+                                max_shaders,
+                                kernel_index,
+                                d_queued_kernel,
+                                d_shader_sort_key,
+                                key_offsets,
+                                (threadgroup int *)threadgroup_array,
+                                metal_local_id,
+                                metal_local_size,
+                                metal_grid_id);
+#endif
+}
+ccl_gpu_kernel_postfix
+
+ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
+    ccl_gpu_kernel_signature(integrator_sort_write_pass,
+                             int num_states,
+                             int partition_size,
+                             int num_states_limit,
+                             ccl_global int *indices,
+                             int kernel_index)
+{
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  int max_shaders = context.launch_params_metal.data.max_shaders;
+  ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
+                                           kernel_integrator_state.path.queued_kernel;
+  ccl_global uint *d_shader_sort_key = (ccl_global uint *)
+                                           kernel_integrator_state.path.shader_sort_key;
+  ccl_global int *key_offsets = (ccl_global int *)
+                                    kernel_integrator_state.sort_partition_key_offsets;
+
+  gpu_parallel_sort_write_pass(num_states,
+                               partition_size,
+                               max_shaders,
+                               kernel_index,
+                               num_states_limit,
+                               indices,
+                               d_queued_kernel,
+                               d_shader_sort_key,
+                               key_offsets,
+                               (threadgroup int *)threadgroup_array,
+                               metal_local_id,
+                               metal_local_size,
+                               metal_grid_id);
+#endif
+}
+ccl_gpu_kernel_postfix
+
 ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
    ccl_gpu_kernel_signature(integrator_compact_paths_array,
                             int num_states,
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -178,7 +178,7 @@ __device__
                                         simd_lane_index, \
                                         simd_group_index, \
                                         num_simd_groups, \
-                                         simdgroup_offset)
+                                         (threadgroup int *)threadgroup_array)
 #elif defined(__KERNEL_ONEAPI__)

 #  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
+#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
+
+#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+
+#  define atomic_store_local(p, x) \
+    atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
+#  define atomic_load_local(p) \
+    atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
+
+ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
+                                                     const uint partition_size,
+                                                     const uint max_shaders,
+                                                     const uint queued_kernel,
+                                                     ccl_global ushort *d_queued_kernel,
+                                                     ccl_global uint *d_shader_sort_key,
+                                                     ccl_global int *partition_key_offsets,
+                                                     ccl_gpu_shared int *buckets,
+                                                     const ushort local_id,
+                                                     const ushort local_size,
+                                                     const ushort grid_id)
+{
+  /* Zero the bucket sizes. */
+  if (local_id < max_shaders) {
+    atomic_store_local(&buckets[local_id], 0);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Determine bucket sizes within the partitions. */
+
+  const uint partition_start = partition_size * uint(grid_id);
+  const uint partition_end = min(num_states, partition_start + partition_size);
+
+  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
+       state_index += uint(local_size)) {
+    ushort kernel_index = d_queued_kernel[state_index];
+    if (kernel_index == queued_kernel) {
+      uint key = d_shader_sort_key[state_index] % max_shaders;
+      atomic_fetch_and_add_uint32(&buckets[key], 1);
+    }
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
+
+  if (local_id == 0) {
+    int offset = 0;
+    for (int i = 0; i < max_shaders; i++) {
+      partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
+      offset = offset + atomic_load_local(&buckets[i]);
+    }
+
+    /* Store the number of active states in this partition. */
+    partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
+  }
+}
+
+ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
+                                                    const uint partition_size,
+                                                    const uint max_shaders,
+                                                    const uint queued_kernel,
+                                                    const int num_states_limit,
+                                                    ccl_global int *indices,
+                                                    ccl_global ushort *d_queued_kernel,
+                                                    ccl_global uint *d_shader_sort_key,
+                                                    ccl_global int *partition_key_offsets,
+                                                    ccl_gpu_shared int *local_offset,
+                                                    const ushort local_id,
+                                                    const ushort local_size,
+                                                    const ushort grid_id)
+{
+  /* Calculate each partition's global offset from the prefix sum of the active state counts per
+   * partition. */
+
+  if (local_id < max_shaders) {
+    int partition_offset = 0;
+    for (int i = 0; i < uint(grid_id); i++) {
+      int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
+      partition_offset += partition_key_count;
+    }
+
+    ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
+    atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
+  }
+
+  ccl_gpu_syncthreads();
+
+  /* Write the sorted active indices. */
+
+  const uint partition_start = partition_size * uint(grid_id);
+  const uint partition_end = min(num_states, partition_start + partition_size);
+
+  ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
+
+  for (int state_index = partition_start + uint(local_id); state_index < partition_end;
+       state_index += uint(local_size)) {
+    ushort kernel_index = d_queued_kernel[state_index];
+    if (kernel_index == queued_kernel) {
+      uint key = d_shader_sort_key[state_index] % max_shaders;
+      int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
+      if (index < num_states_limit) {
+        indices[index] = state_index;
+      }
+    }
+  }
+}
+
+#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */

 template<typename GetKeyOp>
 __device__ void gpu_parallel_sorted_index_array(const uint state_index,
--- a/intern/cycles/kernel/device/metal/bvh.h
+++ b/intern/cycles/kernel/device/metal/bvh.h
@@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
    kernel_assert(!"Invalid ift_local");
    return false;
  }
-#  endif
-
-  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
-  metalrt_intersector_type metalrt_intersect;
-
-  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
-
-  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
-  if (triangle_only) {
-    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
+    if (local_isect) {
+      local_isect->num_hits = 0;
+    }
+    kernel_assert(!"Invalid ift_local_prim");
+    return false;
  }
+#  endif

  MetalRTIntersectionLocalPayload payload;
  payload.self = ray->self;
@@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
  }
  payload.result = false;

-  typename metalrt_intersector_type::result_type intersection;
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);

 #  if defined(__METALRT_MOTION__)
+  metalrt_intersector_type metalrt_intersect;
+  typename metalrt_intersector_type::result_type intersection;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
+  if (triangle_only) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
  intersection = metalrt_intersect.intersect(
      r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
 #  else
+
+  metalrt_blas_intersector_type metalrt_intersect;
+  typename metalrt_blas_intersector_type::result_type intersection;
+
+  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
+  bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
+  if (triangle_only) {
+    metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
+  }
+
+  // if we know we are going to get max one hit, like for random-sss-walk we can
+  // optimize and accept the first hit
+  if (max_hits == 1) {
+    metalrt_intersect.accept_any_intersection(true);
+  }
+
+  int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
+  // transform the ray into object's local space
+  Transform itfm = kernel_data_fetch(objects, local_object).itfm;
+  r.origin = transform_point(&itfm, r.origin);
+  r.direction = transform_direction(&itfm, r.direction);
+
  intersection = metalrt_intersect.intersect(
-      r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
+      r,
+      metal_ancillaries->blas_accel_structs[blas_index].blas,
+      metal_ancillaries->ift_local_prim,
+      payload);
 #  endif

  if (lcg_state) {
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -105,10 +105,11 @@ struct kernel_gpu_##name \
 { \
  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
  void run(thread MetalKernelContext& context, \
-           threadgroup int *simdgroup_offset, \
+           threadgroup atomic_int *threadgroup_array, \
           const uint metal_global_id, \
           const ushort metal_local_id, \
           const ushort metal_local_size, \
+           const ushort metal_grid_id, \
           uint simdgroup_size, \
           uint simd_lane_index, \
           uint simd_group_index, \
@@ -117,22 +118,24 @@ struct kernel_gpu_##name \
 kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
                                constant MetalAncillaries *_metal_ancillaries, \
-                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
                                const uint metal_global_id [[thread_position_in_grid]], \
                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
                                const ushort metal_local_size [[threads_per_threadgroup]], \
+                                const ushort metal_grid_id    [[threadgroup_position_in_grid]], \
                                uint simdgroup_size [[threads_per_simdgroup]], \
                                uint simd_lane_index [[thread_index_in_simdgroup]], \
                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
-  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+  params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
 } \
 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
-                  threadgroup int *simdgroup_offset, \
+                  threadgroup atomic_int *threadgroup_array, \
                  const uint metal_global_id, \
                  const ushort metal_local_id, \
                  const ushort metal_local_size, \
+                  const ushort metal_grid_id, \
                  uint simdgroup_size, \
                  uint simd_lane_index, \
                  uint simd_group_index, \
@@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,

 #  if defined(__METALRT_MOTION__)
 #    define METALRT_TAGS instancing, instance_motion, primitive_motion
+#    define METALRT_BLAS_TAGS , primitive_motion
 #  else
 #    define METALRT_TAGS instancing
+#    define METALRT_BLAS_TAGS
 #  endif /* __METALRT_MOTION__ */

 typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
 typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
 typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
+#  if defined(__METALRT_MOTION__)
+typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
+typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
+typedef metal::raytracing::intersector<triangle_data, primitive_motion>
+    metalrt_blas_intersector_type;
+#  else
+typedef acceleration_structure<> metalrt_blas_as_type;
+typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
+typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
+#  endif

 #endif /* __METALRT__ */

@@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
  texture3d<float, access::sample> tex;
 };

+#ifdef __METALRT__
+struct MetalRTBlasWrapper {
+  metalrt_blas_as_type blas;
+};
+#endif
+
 struct MetalAncillaries {
  device Texture2DParamsMetal *textures_2d;
  device Texture3DParamsMetal *textures_3d;
@@ -291,6 +312,9 @@ struct MetalAncillaries {
  metalrt_ift_type ift_default;
  metalrt_ift_type ift_shadow;
  metalrt_ift_type ift_local;
+  metalrt_blas_ift_type ift_local_prim;
+  constant MetalRTBlasWrapper *blas_accel_structs;
+  constant int *blas_userID_to_index_lookUp;
 #endif
 };

--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
 #endif
 }

+[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
+__anyhit__cycles_metalrt_local_hit_tri_prim(
+    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
+    ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
+    uint primitive_id [[primitive_id]],
+    float2 barycentrics [[barycentric_coord]],
+    float ray_tmax [[distance]])
+{
+  //instance_id, aka the user_id has been removed. If we take this function we optimized the
+  //SSS for starting traversal from a primitive acceleration structure instead of the root of the global AS.
+  //this means we will always be intersecting the correct object no need for the userid to check
+  return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
+      launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
+}
 [[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
 __anyhit__cycles_metalrt_local_hit_tri(
    constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
  return result;
 }

+[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
+__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
+{
+  /* unused function */
+  BoundingBoxIntersectionResult result;
+  result.distance = ray_tmax;
+  result.accept = false;
+  result.continue_search = false;
+  return result;
+}
+
 template<uint intersection_type>
 bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
                            ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
          break;
        }
+        case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
+          break;
+        }
+        case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
+          oneapi_call(
+              kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
+          break;
+        }
        case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
          oneapi_call(kg,
                      cgh,
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
  /* Index of main path which will be used by a next shadow catcher split.  */
  ccl_global int *next_main_path_index;

+  /* Partition/key offsets used when writing sorted active indices. */
+  ccl_global int *sort_partition_key_offsets;
+
  /* Divisor used to partition active indices by locality when sorting by material.  */
  uint sort_partition_divisor;
 } IntegratorStateGPU;
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+
+#  if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
+    return;
+  }
+#  endif
+
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
 }

@@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+
+#  if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
+  if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
+    return;
+  }
+#  endif
+
  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
 }

--- a/intern/cycles/kernel/osl/closures_setup.h
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
  if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
      closure->distribution == make_string("default", 4430693559278735917ull)) {
    if (!closure->refract) {
-      if (closure->alpha_x == closure->alpha_y) {
-        /* Isotropic */
-        sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
-      }
-      else {
-        /* Anisotropic */
-        sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
-      }
+      sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
    }
    else {
      sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
@@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
  /* Beckmann */
  else {
    if (!closure->refract) {
-      if (closure->alpha_x == closure->alpha_y) {
-        /* Isotropic */
-        sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
-      }
-      else {
-        /* Anisotropic */
-        sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
-      }
+      sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
    }
    else {
      sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
@@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
  }

  bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
-  bsdf->alpha_x = closure->alpha_x;
+  bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;

-  sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
+  sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
 }

 ccl_device void osl_closure_microfacet_ggx_aniso_setup(
@@ -652,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
  }

  bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
-  bsdf->alpha_x = closure->alpha_x;
+  bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;

-  sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
+  sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
 }

 ccl_device void osl_closure_microfacet_beckmann_aniso_setup(
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME__

 /* TODO: solve internal compiler errors and enable light tree on HIP. */
-#ifdef __KERNEL_HIP__
+/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
+#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
 #  undef __LIGHT_TREE__
 #endif

@@ -1508,6 +1509,8 @@ typedef enum DeviceKernel : int {
  DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
  DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
  DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
+  DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
+  DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
  DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
  DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
  DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -13,6 +13,7 @@
 #include "scene/light.h"
 #include "scene/mesh.h"
 #include "scene/object.h"
+#include "scene/osl.h"
 #include "scene/pointcloud.h"
 #include "scene/scene.h"
 #include "scene/shader.h"
@@ -25,7 +26,6 @@

 #ifdef WITH_OSL
 #  include "kernel/osl/globals.h"
-#  include "kernel/osl/services.h"
 #endif

 #include "util/foreach.h"
@@ -1717,20 +1717,7 @@ void GeometryManager::device_update_displacement_images(Device *device,
  /* If any OSL node is used for displacement, it may reference a texture. But it's
   * unknown which ones, so have to load them all. */
  if (has_osl_node) {
-    set<OSLRenderServices *> services_shared;
-    device->foreach_device([&services_shared](Device *sub_device) {
-      OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
-      services_shared.insert(og->services);
-    });
-
-    for (OSLRenderServices *services : services_shared) {
-      for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
-        if (it->second->handle.get_manager() == image_manager) {
-          const int slot = it->second->handle.svm_slot();
-          bump_images.insert(slot);
-        }
-      }
-    }
+    OSLShaderManager::osl_image_slots(device, image_manager, bump_images);
  }
 #endif

--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -665,6 +665,27 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,
  return node;
 }

+/* Static function, so only this file needs to be compile with RTTT. */
+void OSLShaderManager::osl_image_slots(Device *device,
+                                       ImageManager *image_manager,
+                                       set<int> &image_slots)
+{
+  set<OSLRenderServices *> services_shared;
+  device->foreach_device([&services_shared](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    services_shared.insert(og->services);
+  });
+
+  for (OSLRenderServices *services : services_shared) {
+    for (auto it = services->textures.begin(); it != services->textures.end(); ++it) {
+      if (it->second->handle.get_manager() == image_manager) {
+        const int slot = it->second->handle.svm_slot();
+        image_slots.insert(slot);
+      }
+    }
+  }
+}
+
 /* Graph Compiler */

 OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -92,6 +92,9 @@ class OSLShaderManager : public ShaderManager {
                           const std::string &bytecode_hash = "",
                           const std::string &bytecode = "");

+  /* Get image slots used by OSL services on device. */
+  static void osl_image_slots(Device *device, ImageManager *image_manager, set<int> &image_slots);
+
 private:
  void texture_system_init();
  void texture_system_free();
--- a/intern/cycles/util/atomic.h
+++ b/intern/cycles/util/atomic.h
@@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
  return new_value.float_value;
 }

-#    define atomic_fetch_and_add_uint32(p, x) \
-      atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
-#    define atomic_fetch_and_sub_uint32(p, x) \
-      atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
-#    define atomic_fetch_and_inc_uint32(p) \
-      atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
-#    define atomic_fetch_and_dec_uint32(p) \
-      atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
-#    define atomic_fetch_and_or_uint32(p, x) \
-      atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
+{
+  return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
+{
+  return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
+{
+  return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
+{
+  return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
+{
+  return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
+{
+  return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
+{
+  return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
+{
+  return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
+{
+  return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
+}
+
+template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
+{
+  return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
+}

 ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
                                                      const float old_val,
--- a/intern/cycles/util/debug.cpp
+++ b/intern/cycles/util/debug.cpp
@@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
 {
  if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
    adaptive_compile = true;
+
+  if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
+    use_local_atomic_sort = (atoi(str) != 0);
 }

 DebugFlags::OptiX::OptiX()
--- a/intern/cycles/util/debug.h
+++ b/intern/cycles/util/debug.h
@@ -97,6 +97,9 @@ class DebugFlags {

    /* Whether adaptive feature based runtime compile is enabled or not. */
    bool adaptive_compile = false;
+
+    /* Whether local atomic sorting is enabled or not. */
+    bool use_local_atomic_sort = true;
  };

  /* Get instance of debug flags registry. */
--- a/intern/cycles/util/math_fast.h
+++ b/intern/cycles/util/math_fast.h
@@ -74,7 +74,7 @@ ccl_device float fast_sinf(float x)
   *
   * Results on: [-2pi,2pi].
   *
-   * Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
+   * Examined 2173837240 values of sin: 0.00662760244 avg ULP diff, 2 max ULP,
   * 1.19209e-07 max error
   */
  int q = fast_rint(x * M_1_PI_F);
@@ -256,11 +256,11 @@ ccl_device float fast_acosf(float x)
  /* clamp and crush denormals. */
  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
  /* Based on http://www.pouet.net/topic.php?which=9132&page=2
-   * 85% accurate (ulp 0)
+   * 85% accurate (ULP 0)
   * Examined 2130706434 values of acos:
-   *   15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
+   *   15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
   * Examined 2130706434 values of acos:
-   *   15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
+   *   15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
   */
  const float a = sqrtf(1.0f - m) *
                  (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
@@ -270,9 +270,8 @@ ccl_device float fast_acosf(float x)
 ccl_device float fast_asinf(float x)
 {
  /* Based on acosf approximation above.
-   * Max error is 4.51133e-05 (ulps are higher because we are consistently off
-   * by a little amount).
-   */
+   * Max error is 4.51133e-05 (ULPS are higher because we are consistently off
+   * by a little amount). */
  const float f = fabsf(x);
  /* Clamp and crush denormals. */
  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
@@ -290,9 +289,9 @@ ccl_device float fast_atanf(float x)
  const float t = s * s;
  /* http://mathforum.org/library/drmath/view/62672.html
   * Examined 4278190080 values of atan:
-   *   2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error      // (with  denormals)
+   *   2.36864877 avg ULP diff, 302 max ULP, 6.55651e-06 max error      // (with  denormals)
   * Examined 4278190080 values of atan:
-   *   171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
+   *   171160502 avg ULP diff, 855638016 max ULP, 6.55651e-06 max error // (crush denormals)
   */
  float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
  if (a > 1.0f) {
@@ -343,8 +342,8 @@ ccl_device float fast_log2f(float x)
  int exponent = (int)(bits >> 23) - 127;
  float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
  /* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
-   * 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
-   * ulp histogram:
+   * 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.
+   * ULP histogram:
   *  0  = 97.46%
   *  1  =  2.29%
   *  2  =  0.11%
@@ -363,7 +362,7 @@ ccl_device float fast_log2f(float x)
 ccl_device_inline float fast_logf(float x)
 {
  /* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
-   * 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
+   * 0.313865375 avg ULP diff, 5148137 max ULP, 7.62939e-06 max error.
   */
  return fast_log2f(x) * M_LN2_F;
 }
@@ -371,7 +370,7 @@ ccl_device_inline float fast_logf(float x)
 ccl_device_inline float fast_log10(float x)
 {
  /* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
-   * 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
+   * 0.631237033 avg ULP diff, 4471615 max ULP, 3.8147e-06 max error.
   */
  return fast_log2f(x) * M_LN2_F / M_LN10_F;
 }
@@ -392,12 +391,12 @@ ccl_device float fast_exp2f(float x)
  /* Range reduction. */
  int m = (int)x;
  x -= m;
-  x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
+  x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ULPS!). */
  /* 5th degree polynomial generated with sollya
-   * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
-   * 232 max ulp.
+   * Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ULP diff,
+   * 232 max ULP.
   *
-   * ulp histogram:
+   * ULP histogram:
   *  0  = 87.81%
   *  1  =  4.18%
   */
@@ -415,7 +414,7 @@ ccl_device float fast_exp2f(float x)
 ccl_device_inline float fast_expf(float x)
 {
  /* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
-   * 2.6666452 avg ulp diff, 230 max ulp.
+   * 2.6666452 avg ULP diff, 230 max ULP.
   */
  return fast_exp2f(x / M_LN2_F);
 }
@@ -454,7 +453,7 @@ ccl_device_inline float4 fast_expf4(float4 x)
 ccl_device_inline float fast_exp10(float x)
 {
  /* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
-   * 2.71732409 avg ulp diff, 232 max ulp.
+   * 2.71732409 avg ULP diff, 232 max ULP.
   */
  return fast_exp2f(x * M_LN10_F / M_LN2_F);
 }
@@ -475,7 +474,7 @@ ccl_device float fast_sinhf(float x)
  float a = fabsf(x);
  if (a > 1.0f) {
    /* Examined 53389559 values of sinh on [1,87.3300018]:
-     * 33.6886442 avg ulp diff, 178 max ulp. */
+     * 33.6886442 avg ULP diff, 178 max ULP. */
    float e = fast_expf(a);
    return copysignf(0.5f * e - 0.5f / e, x);
  }
@@ -495,7 +494,7 @@ ccl_device float fast_sinhf(float x)
 ccl_device_inline float fast_coshf(float x)
 {
  /* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
-   * 1.78256726 avg ulp diff, 178 max ulp.
+   * 1.78256726 avg ULP diff, 178 max ULP.
   */
  float e = fast_expf(fabsf(x));
  return 0.5f * e + 0.5f / e;
@@ -506,7 +505,7 @@ ccl_device_inline float fast_tanhf(float x)
  /* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
   * 3.12924e-06 max error.
   */
-  /* NOTE: ulp error is high because of sub-optimal handling around the origin. */
+  /* NOTE: ULP error is high because of sub-optimal handling around the origin. */
  float e = fast_expf(2.0f * fabsf(x));
  return copysignf(1.0f - 2.0f / (1.0f + e), x);
 }
@@ -579,7 +578,7 @@ ccl_device_inline float fast_erfcf(float x)
 {
  /* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
   *
-   * ulp histogram:
+   * ULP histogram:
   *
   *  0  = 80.30%
   */
--- a/intern/ghost/intern/GHOST_SystemSDL.cpp
+++ b/intern/ghost/intern/GHOST_SystemSDL.cpp
@@ -602,7 +602,7 @@ void GHOST_SystemSDL::processEvent(SDL_Event *sdl_event)
      /* NOTE: the `sdl_sub_evt.keysym.sym` is truncated,
       * for unicode support ghost has to be modified. */

-      /* TODO(@campbellbarton): support full unicode, SDL supports this but it needs to be
+      /* TODO(@ideasman42): support full unicode, SDL supports this but it needs to be
       * explicitly enabled via #SDL_StartTextInput which GHOST would have to wrap. */
      char utf8_buf[sizeof(GHOST_TEventKeyData::utf8_buf)] = {'\0'};
      if (type == GHOST_kEventKeyDown) {
--- a/intern/ghost/intern/GHOST_SystemWayland.cpp
+++ b/intern/ghost/intern/GHOST_SystemWayland.cpp
@@ -82,6 +82,8 @@
 #include "CLG_log.h"

 #ifdef USE_EVENT_BACKGROUND_THREAD
+#  include "GHOST_TimerTask.h"
+
 #  include <pthread.h>
 #endif

@@ -239,7 +241,7 @@ enum {
  BTN_STYLUS = 0x14b,
  /** Use as right-mouse. */
  BTN_STYLUS2 = 0x14c,
-  /** NOTE(@campbellbarton): Map to an additional button (not sure which hardware uses this). */
+  /** NOTE(@ideasman42): Map to an additional button (not sure which hardware uses this). */
  BTN_STYLUS3 = 0x149,
 };

@@ -768,7 +770,12 @@ struct GWL_Seat {
    int32_t rate = 0;
    /** Time (milliseconds) after which to start repeating keys. */
    int32_t delay = 0;
-    /** Timer for key repeats. */
+    /**
+     * Timer for key repeats.
+     *
+     * \note For as long as #USE_EVENT_BACKGROUND_THREAD is defined, any access to this
+     * (including null checks, must lock `timer_mutex` first.
+     */
    GHOST_ITimerTask *timer = nullptr;
  } key_repeat;

@@ -832,6 +839,42 @@ static bool gwl_seat_key_depressed_suppress_warning(const GWL_Seat *seat)
  return suppress_warning;
 }

+/**
+ * \note Caller must lock `timer_mutex`.
+ */
+static void gwl_seat_key_repeat_timer_add(GWL_Seat *seat,
+                                          GHOST_TimerProcPtr key_repeat_fn,
+                                          GHOST_TUserDataPtr payload,
+                                          const bool use_delay)
+{
+  GHOST_SystemWayland *system = seat->system;
+  const uint64_t time_step = 1000 / seat->key_repeat.rate;
+  const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
+#ifdef USE_EVENT_BACKGROUND_THREAD
+  GHOST_TimerTask *timer = new GHOST_TimerTask(
+      system->getMilliSeconds() + time_start, time_step, key_repeat_fn, payload);
+  seat->key_repeat.timer = timer;
+  system->ghost_timer_manager()->addTimer(timer);
+#else
+  seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
+#endif
+}
+
+/**
+ * \note The caller must lock `timer_mutex`.
+ */
+static void gwl_seat_key_repeat_timer_remove(GWL_Seat *seat)
+{
+  GHOST_SystemWayland *system = seat->system;
+#ifdef USE_EVENT_BACKGROUND_THREAD
+  system->ghost_timer_manager()->removeTimer(
+      static_cast<GHOST_TimerTask *>(seat->key_repeat.timer));
+#else
+  system->removeTimer(seat->key_repeat.timer);
+#endif
+  seat->key_repeat.timer = nullptr;
+}
+
 /** \} */

 /* -------------------------------------------------------------------- */
@@ -873,7 +916,7 @@ struct GWL_Display {
   * The main purpose of having an active seat is an alternative from always using the first
   * seat which prevents events from any other seat.
   *
-   * NOTE(@campbellbarton): This could be extended and developed further extended to support
+   * NOTE(@ideasman42): This could be extended and developed further extended to support
   * an active seat per window (for e.g.), basic support is sufficient for now as currently isn't
   * a widely used feature.
   */
@@ -906,6 +949,16 @@ struct GWL_Display {
  /** Guard against multiple threads accessing `events_pending` at once. */
  std::mutex events_pending_mutex;

+  /**
+   * A separate timer queue, needed so the WAYLAND thread can lock access.
+   * Using the system's #GHOST_Sysem::getTimerManager is not thread safe because
+   * access to the timer outside of WAYLAND specific logic will not lock.
+   *
+   * Needed because #GHOST_System::dispatchEvents fires timers
+   * outside of WAYLAND (without locking the `timer_mutex`).
+   */
+  GHOST_TimerManager *ghost_timer_manager = nullptr;
+
 #endif /* USE_EVENT_BACKGROUND_THREAD */
 };

@@ -959,6 +1012,13 @@ static void gwl_display_destroy(GWL_Display *display)
    gwl_display_event_thread_destroy(display);
    display->system->server_mutex->unlock();
  }
+
+  /* Important to remove after the seats which may have key repeat timers active. */
+  if (display->ghost_timer_manager) {
+    delete display->ghost_timer_manager;
+    display->ghost_timer_manager = nullptr;
+  }
+
 #endif /* USE_EVENT_BACKGROUND_THREAD */

  if (display->wl_display) {
@@ -1179,7 +1239,7 @@ static void gwl_registry_entry_remove_all(GWL_Display *display)
 {
  const bool on_exit = true;

-  /* NOTE(@campbellbarton): Free by slot instead of simply looping over
+  /* NOTE(@ideasman42): Free by slot instead of simply looping over
   * `display->registry_entry` so the order of freeing is always predictable.
   * Otherwise global objects would be feed in the order they are registered.
   * While this works in my tests, it could cause difficult to reproduce bugs
@@ -1209,7 +1269,7 @@ static void gwl_registry_entry_remove_all(GWL_Display *display)
 * so there is no reason to update all other outputs that an output was removed (for e.g.).
 * Pass as -1 to update all slots.
 *
- * NOTE(@campbellbarton): Updating all other items on a single change is typically worth avoiding.
+ * NOTE(@ideasman42): Updating all other items on a single change is typically worth avoiding.
 * In practice this isn't a problem as so there are so few elements in `display->registry_entry`,
 * so few use update functions and adding/removal at runtime is rarely called (plugging/unplugging)
 * hardware for e.g. So while it's possible to store dependency links to avoid unnecessary
@@ -1258,7 +1318,7 @@ static void ghost_wl_display_report_error(struct wl_display *display)
    fprintf(stderr, "The Wayland connection experienced a fatal error: %s\n", strerror(ecode));
  }

-  /* NOTE(@campbellbarton): The application is running,
+  /* NOTE(@ideasman42): The application is running,
   * however an error closes all windows and most importantly:
   * shuts down the GPU context (loosing all GPU state - shaders, bind codes etc),
   * so recovering from this effectively involves restarting.
@@ -2910,7 +2970,7 @@ static void gesture_pinch_handle_begin(void *data,
  if (wl_surface *wl_surface_focus = seat->pointer.wl_surface_window) {
    win = ghost_wl_surface_user_data(wl_surface_focus);
  }
-  /* NOTE(@campbellbarton): Blender's use of track-pad coordinates is inconsistent and needs work.
+  /* NOTE(@ideasman42): Blender's use of track-pad coordinates is inconsistent and needs work.
   * This isn't specific to WAYLAND, in practice they tend to work well enough in most cases.
   * Some operators scale by the UI scale, some don't.
   * Even this window scale is not correct because it doesn't account for:
@@ -2924,7 +2984,7 @@ static void gesture_pinch_handle_begin(void *data,
   */
  const wl_fixed_t win_scale = win ? win->scale() : 1;

-  /* NOTE(@campbellbarton): Scale factors match Blender's operators & default preferences.
+  /* NOTE(@ideasman42): Scale factors match Blender's operators & default preferences.
   * For these values to work correctly, operator logic will need to be changed not to scale input
   * by the region size (as with 3D view zoom) or preference for 3D view orbit sensitivity.
   *
@@ -3087,7 +3147,7 @@ static const struct zwp_pointer_gesture_swipe_v1_listener gesture_swipe_listener
 /* -------------------------------------------------------------------- */
 /** \name Listener (Touch Seat), #wl_touch_listener
 *
- * NOTE(@campbellbarton): It's not clear if this interface is used by popular compositors.
+ * NOTE(@ideasman42): It's not clear if this interface is used by popular compositors.
 * It looks like GNOME/KDE only support `zwp_pointer_gestures_v1_interface`.
 * If this isn't used anywhere, it could be removed.
 * \{ */
@@ -3718,9 +3778,14 @@ static void keyboard_handle_leave(void *data,
  GWL_Seat *seat = static_cast<GWL_Seat *>(data);
  seat->keyboard.wl_surface_window = nullptr;

-  /* Losing focus must stop repeating text. */
-  if (seat->key_repeat.timer) {
-    keyboard_handle_key_repeat_cancel(seat);
+  {
+#ifdef USE_EVENT_BACKGROUND_THREAD
+    std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
+#endif
+    /* Losing focus must stop repeating text. */
+    if (seat->key_repeat.timer) {
+      keyboard_handle_key_repeat_cancel(seat);
+    }
  }

 #ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@@ -3743,7 +3808,7 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
  /* Use an empty keyboard state to access key symbol without modifiers. */
  xkb_keysym_t sym = xkb_state_key_get_one_sym(xkb_state_empty, key);

-  /* NOTE(@campbellbarton): Only perform the number-locked lookup as a fallback
+  /* NOTE(@ideasman42): Only perform the number-locked lookup as a fallback
   * when a number-pad key has been pressed. This is important as some key-maps use number lock
   * for switching other layers (in particular `de(neo_qwertz)` turns on layer-4), see: T96170.
   * Alternative solutions could be to inspect the layout however this could get involved
@@ -3780,36 +3845,32 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
  return sym;
 }

+/**
+ * \note Caller must lock `timer_mutex`.
+ */
 static void keyboard_handle_key_repeat_cancel(GWL_Seat *seat)
 {
-#ifdef USE_EVENT_BACKGROUND_THREAD
-  std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
-#endif
  GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller much check for timer");
  delete static_cast<GWL_KeyRepeatPlayload *>(seat->key_repeat.timer->getUserData());
-  seat->system->removeTimer(seat->key_repeat.timer);
-  seat->key_repeat.timer = nullptr;
+
+  gwl_seat_key_repeat_timer_remove(seat);
 }

 /**
 * Restart the key-repeat timer.
 * \param use_delay: When false, use the interval
 * (prevents pause when the setting changes while the key is held).
+ *
+ * \note Caller must lock `timer_mutex`.
 */
 static void keyboard_handle_key_repeat_reset(GWL_Seat *seat, const bool use_delay)
 {
-#ifdef USE_EVENT_BACKGROUND_THREAD
-  std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
-#endif
  GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller much check for timer");
-  GHOST_SystemWayland *system = seat->system;
-  GHOST_ITimerTask *timer = seat->key_repeat.timer;
-  GHOST_TimerProcPtr key_repeat_fn = timer->getTimerProc();
+  GHOST_TimerProcPtr key_repeat_fn = seat->key_repeat.timer->getTimerProc();
  GHOST_TUserDataPtr payload = seat->key_repeat.timer->getUserData();
-  seat->system->removeTimer(seat->key_repeat.timer);
-  const uint64_t time_step = 1000 / seat->key_repeat.rate;
-  const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
-  seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
+
+  gwl_seat_key_repeat_timer_remove(seat);
+  gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, payload, use_delay);
 }

 static void keyboard_handle_key(void *data,
@@ -3848,6 +3909,11 @@ static void keyboard_handle_key(void *data,
      break;
  }

+#ifdef USE_EVENT_BACKGROUND_THREAD
+  /* Any access to `seat->key_repeat.timer` must lock. */
+  std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
+#endif
+
  struct GWL_KeyRepeatPlayload *key_repeat_payload = nullptr;

  /* Delete previous timer. */
@@ -3872,7 +3938,7 @@ static void keyboard_handle_key(void *data,
      else {
        /* Key-up from keys that were not repeating cause the repeat timer to pause.
         *
-         * NOTE(@campbellbarton): This behavior isn't universal, some text input systems will
+         * NOTE(@ideasman42): This behavior isn't universal, some text input systems will
         * stop the repeat entirely. Choose to pause repeat instead as this is what GTK/WIN32 do,
         * and it fits better for keyboard input that isn't related to text entry. */
        timer_action = RESET;
@@ -3886,23 +3952,14 @@ static void keyboard_handle_key(void *data,
        break;
      }
      case RESET: {
-#ifdef USE_EVENT_BACKGROUND_THREAD
-        std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
-#endif
        /* The payload will be added again. */
-        seat->system->removeTimer(seat->key_repeat.timer);
-        seat->key_repeat.timer = nullptr;
+        gwl_seat_key_repeat_timer_remove(seat);
        break;
      }
      case CANCEL: {
-#ifdef USE_EVENT_BACKGROUND_THREAD
-        std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
-#endif
        delete key_repeat_payload;
        key_repeat_payload = nullptr;
-
-        seat->system->removeTimer(seat->key_repeat.timer);
-        seat->key_repeat.timer = nullptr;
+        gwl_seat_key_repeat_timer_remove(seat);
        break;
      }
    }
@@ -3956,8 +4013,8 @@ static void keyboard_handle_key(void *data,
                                                           utf8_buf));
      }
    };
-    seat->key_repeat.timer = seat->system->installTimer(
-        seat->key_repeat.delay, 1000 / seat->key_repeat.rate, key_repeat_fn, key_repeat_payload);
+
+    gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, key_repeat_payload, true);
  }
 }

@@ -3982,8 +4039,13 @@ static void keyboard_handle_modifiers(void *data,

  /* A modifier changed so reset the timer,
   * see comment in #keyboard_handle_key regarding this behavior. */
-  if (seat->key_repeat.timer) {
-    keyboard_handle_key_repeat_reset(seat, true);
+  {
+#ifdef USE_EVENT_BACKGROUND_THREAD
+    std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
+#endif
+    if (seat->key_repeat.timer) {
+      keyboard_handle_key_repeat_reset(seat, true);
+    }
  }

 #ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@@ -4002,9 +4064,14 @@ static void keyboard_repeat_handle_info(void *data,
  seat->key_repeat.rate = rate;
  seat->key_repeat.delay = delay;

-  /* Unlikely possible this setting changes while repeating. */
-  if (seat->key_repeat.timer) {
-    keyboard_handle_key_repeat_reset(seat, false);
+  {
+#ifdef USE_EVENT_BACKGROUND_THREAD
+    std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
+#endif
+    /* Unlikely possible this setting changes while repeating. */
+    if (seat->key_repeat.timer) {
+      keyboard_handle_key_repeat_reset(seat, false);
+    }
  }
 }

@@ -4275,8 +4342,14 @@ static void gwl_seat_capability_keyboard_disable(GWL_Seat *seat)
  if (!seat->wl_keyboard) {
    return;
  }
-  if (seat->key_repeat.timer) {
-    keyboard_handle_key_repeat_cancel(seat);
+
+  {
+#ifdef USE_EVENT_BACKGROUND_THREAD
+    std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
+#endif
+    if (seat->key_repeat.timer) {
+      keyboard_handle_key_repeat_cancel(seat);
+    }
  }
  wl_keyboard_destroy(seat->wl_keyboard);
  seat->wl_keyboard = nullptr;
@@ -5411,6 +5484,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)

 #ifdef USE_EVENT_BACKGROUND_THREAD
  gwl_display_event_thread_create(display_);
+
+  display_->ghost_timer_manager = new GHOST_TimerManager();
 #endif
 }

@@ -5491,10 +5566,16 @@ bool GHOST_SystemWayland::processEvents(bool waitForEvent)
 #endif /* USE_EVENT_BACKGROUND_THREAD */

  {
+    const uint64_t now = getMilliSeconds();
 #ifdef USE_EVENT_BACKGROUND_THREAD
-    std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
+    {
+      std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
+      if (ghost_timer_manager()->fireTimers(now)) {
+        any_processed = true;
+      }
+    }
 #endif
-    if (getTimerManager()->fireTimers(getMilliSeconds())) {
+    if (getTimerManager()->fireTimers(now)) {
      any_processed = true;
    }
  }
@@ -6717,6 +6798,13 @@ struct wl_shm *GHOST_SystemWayland::wl_shm() const
  return display_->wl_shm;
 }

+#ifdef USE_EVENT_BACKGROUND_THREAD
+GHOST_TimerManager *GHOST_SystemWayland::ghost_timer_manager()
+{
+  return display_->ghost_timer_manager;
+}
+#endif
+
 /** \} */

 /* -------------------------------------------------------------------- */
@@ -6949,7 +7037,7 @@ bool GHOST_SystemWayland::window_cursor_grab_set(const GHOST_TGrabCursorMode mod
                                                         UNPACK2(xy_next));
          wl_surface_commit(wl_surface);

-          /* NOTE(@campbellbarton): The new cursor position is a hint,
+          /* NOTE(@ideasman42): The new cursor position is a hint,
           * it's possible the hint is ignored. It doesn't seem like there is a good way to
           * know if the hint will be used or not, at least not immediately. */
          xy_motion[0] = xy_next[0];
@@ -6992,7 +7080,7 @@ bool GHOST_SystemWayland::window_cursor_grab_set(const GHOST_TGrabCursorMode mod
  if (mode != GHOST_kGrabDisable) {
    if (grab_state_next.use_lock) {
      if (!grab_state_prev.use_lock) {
-        /* TODO(@campbellbarton): As WAYLAND does not support warping the pointer it may not be
+        /* TODO(@ideasman42): As WAYLAND does not support warping the pointer it may not be
         * possible to support #GHOST_kGrabWrap by pragmatically settings it's coordinates.
         * An alternative could be to draw the cursor in software (and hide the real cursor),
         * or just accept a locked cursor on WAYLAND. */
--- a/intern/ghost/intern/GHOST_SystemWayland.h
+++ b/intern/ghost/intern/GHOST_SystemWayland.h
@@ -165,6 +165,16 @@ class GHOST_SystemWayland : public GHOST_System {

  bool cursor_grab_use_software_display_get(const GHOST_TGrabCursorMode mode);

+#ifdef USE_EVENT_BACKGROUND_THREAD
+  /**
+   * Return a separate WAYLAND local timer manager to #GHOST_System::getTimerManager
+   * Manipulation & access must lock with #GHOST_WaylandSystem::server_mutex.
+   *
+   * See #GWL_Display::ghost_timer_manager doc-string for details on why this is needed.
+   */
+  GHOST_TimerManager *ghost_timer_manager();
+#endif
+
  /* WAYLAND direct-data access. */

  struct wl_display *wl_display();
@@ -233,7 +243,14 @@ class GHOST_SystemWayland : public GHOST_System {
   * from running at the same time. */
  std::mutex *server_mutex = nullptr;

-  /** Threads must lock this before manipulating timers. */
+  /**
+   * Threads must lock this before manipulating #GWL_Display::ghost_timer_manager.
+   *
+   * \note Using a separate lock to `server_mutex` is necessary because the
+   * server lock is already held when calling `ghost_wl_display_event_pump`.
+   * If manipulating the timer used the `server_mutex`, event pump can indirectly
+   * handle key up/down events which would lock `server_mutex` causing a dead-lock.
+   */
  std::mutex *timer_mutex = nullptr;

  std::thread::id main_thread_id;
--- a/intern/ghost/intern/GHOST_SystemWin32.cpp
+++ b/intern/ghost/intern/GHOST_SystemWin32.cpp
@@ -1080,7 +1080,7 @@ GHOST_EventCursor *GHOST_SystemWin32::processCursorEvent(GHOST_WindowWin32 *wind
      if (window->getCursorGrabMode() == GHOST_kGrabHide) {
        window->getClientBounds(bounds);

-        /* WARNING(@campbellbarton): The current warping logic fails to warp on every event,
+        /* WARNING(@ideasman42): The current warping logic fails to warp on every event,
         * so the box needs to small enough not to let the cursor escape the window but large
         * enough that the cursor isn't being warped every time.
         * If this was not the case it would be less trouble to simply warp the cursor to the
@@ -1179,7 +1179,7 @@ GHOST_EventKey *GHOST_SystemWin32::processKeyEvent(GHOST_WindowWin32 *window, RA
  GHOST_TKey key = system->hardKey(raw, &key_down);
  GHOST_EventKey *event;

-  /* NOTE(@campbellbarton): key repeat in WIN32 also applies to modifier-keys.
+  /* NOTE(@ideasman42): key repeat in WIN32 also applies to modifier-keys.
   * Check for this case and filter out modifier-repeat.
   * Typically keyboard events are *not* filtered as part of GHOST's event handling.
   * As other GHOST back-ends don't have the behavior, it's simplest not to send them through.
--- a/intern/ghost/intern/GHOST_SystemWin32.h
+++ b/intern/ghost/intern/GHOST_SystemWin32.h
@@ -282,7 +282,7 @@ class GHOST_SystemWin32 : public GHOST_System {
  GHOST_TSuccess exit();

  /**
-   * Converts raw WIN32 key codes from the wndproc to GHOST keys.
+   * Converts raw WIN32 key codes from the `wndproc` to GHOST keys.
   * \param vKey: The virtual key from #hardKey.
   * \param ScanCode: The ScanCode of pressed key (similar to PS/2 Set 1).
   * \param extend: Flag if key is not primly (left or right).
@@ -291,7 +291,7 @@ class GHOST_SystemWin32 : public GHOST_System {
  GHOST_TKey convertKey(short vKey, short ScanCode, short extend) const;

  /**
-   * Catches raw WIN32 key codes from WM_INPUT in the wndproc.
+   * Catches raw WIN32 key codes from WM_INPUT in the `wndproc`.
   * \param raw: RawInput structure with detailed info about the key event.
   * \param r_key_down: Set true when the key is pressed, otherwise false.
   * \return The GHOST key (GHOST_kKeyUnknown if no match).
@@ -319,8 +319,8 @@ class GHOST_SystemWin32 : public GHOST_System {
   * Creates tablet events from pointer events.
   * \param type: The type of pointer event.
   * \param window: The window receiving the event (the active window).
-   * \param wParam: The wParam from the wndproc.
-   * \param lParam: The lParam from the wndproc.
+   * \param wParam: The wParam from the `wndproc`.
+   * \param lParam: The lParam from the `wndproc`.
   * \param eventhandled: True if the method handled the event.
   */
  static void processPointerEvent(
@@ -337,8 +337,8 @@ class GHOST_SystemWin32 : public GHOST_System {
  /**
   * Handles a mouse wheel event.
   * \param window: The window receiving the event (the active window).
-   * \param wParam: The wParam from the wndproc.
-   * \param lParam: The lParam from the wndproc.
+   * \param wParam: The wParam from the `wndproc`.
+   * \param lParam: The lParam from the `wndproc`.
   */
  static void processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam);

--- a/intern/ghost/intern/GHOST_SystemX11.cpp
+++ b/intern/ghost/intern/GHOST_SystemX11.cpp
@@ -278,7 +278,7 @@ uint8_t GHOST_SystemX11::getNumDisplays() const
 void GHOST_SystemX11::getMainDisplayDimensions(uint32_t &width, uint32_t &height) const
 {
  if (m_display) {
-    /* NOTE(@campbellbarton): for this to work as documented,
+    /* NOTE(@ideasman42): for this to work as documented,
     * we would need to use Xinerama check r54370 for code that did this,
     * we've since removed since its not worth the extra dependency. */
    getAllDisplayDimensions(width, height);
@@ -927,7 +927,7 @@ void GHOST_SystemX11::processEvent(XEvent *xe)
          if (window->getCursorGrabMode() == GHOST_kGrabHide) {
            window->getClientBounds(bounds);

-            /* TODO(@campbellbarton): warp the cursor to `window->getCursorGrabInitPos`,
+            /* TODO(@ideasman42): warp the cursor to `window->getCursorGrabInitPos`,
             * on every motion event, see: D16557 (alternative fix for T102346). */
            const int32_t subregion_div = 4; /* One quarter of the region. */
            const int32_t size[2] = {bounds.getWidth(), bounds.getHeight()};
@@ -2015,8 +2015,8 @@ void GHOST_SystemX11::getClipboard_xcout(
        return;
      }

-      /* if it's not incr, and not format == 8, then there's
-       * nothing in the selection (that xclip understands, anyway) */
+      /* If it's not INCR, and not `format == 8`, then there's
+       * nothing in the selection (that `xclip` understands, anyway). */

      if (pty_format != 8) {
        *context = XCLIB_XCOUT_NONE;
--- a/intern/ghost/intern/GHOST_WindowWayland.cpp
+++ b/intern/ghost/intern/GHOST_WindowWayland.cpp
@@ -669,7 +669,7 @@ static void xdg_surface_handle_configure(void *data,
  GHOST_SystemWayland *system = win->ghost_system;
  const bool is_main_thread = system->main_thread_id == std::this_thread::get_id();
  if (!is_main_thread) {
-    /* NOTE(@campbellbarton): this only gets one redraw,
+    /* NOTE(@ideasman42): this only gets one redraw,
     * I could not find a case where this causes problems. */
    gwl_window_pending_actions_tag(win, PENDING_FRAME_CONFIGURE);
  }
@@ -774,7 +774,7 @@ GHOST_WindowWayland::GHOST_WindowWayland(GHOST_SystemWayland *system,
  window_->ghost_window = this;
  window_->ghost_system = system;

-  /* NOTE(@campbellbarton): The scale set here to avoid flickering on startup.
+  /* NOTE(@ideasman42): The scale set here to avoid flickering on startup.
   * When all monitors use the same scale (which is quite common) there aren't any problems.
   *
   * When monitors have different scales there may still be a visible window resize on startup.
@@ -1078,7 +1078,7 @@ GHOST_WindowWayland::~GHOST_WindowWayland()

  wl_surface_destroy(window_->wl_surface);

-  /* NOTE(@campbellbarton): Flushing will often run the appropriate handlers event
+  /* NOTE(@ideasman42): Flushing will often run the appropriate handlers event
   * (#wl_surface_listener.leave in particular) to avoid attempted access to the freed surfaces.
   * This is not fool-proof though, hence the call to #window_surface_unref, see: T99078. */
  wl_display_flush(system_->wl_display());
--- a/intern/libmv/bundle.sh
+++ b/intern/libmv/bundle.sh
@@ -8,9 +8,9 @@ else
  exit 1
 fi

-BRANCH="master"
+BRANCH="main"

-# repo="git://git.blender.org/libmv.git"
+# repo="https://projects.blender.org/blender/libmv.git"
 repo="/home/sergey/Developer/libmv"
 tmp=`mktemp -d`

--- a/intern/mikktspace/mikk_util.hh
+++ b/intern/mikktspace/mikk_util.hh
@@ -40,11 +40,11 @@ inline float fast_acosf(float x)
  /* clamp and crush denormals. */
  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
  /* Based on http://www.pouet.net/topic.php?which=9132&page=2
-   * 85% accurate (ulp 0)
+   * 85% accurate (ULP 0)
   * Examined 2130706434 values of acos:
-   *   15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
+   *   15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
   * Examined 2130706434 values of acos:
-   *   15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
+   *   15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
   */
  const float a = sqrtf(1.0f - m) *
                  (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
--- a/intern/opensubdiv/internal/evaluator/eval_output.h
+++ b/intern/opensubdiv/internal/evaluator/eval_output.h
@@ -89,7 +89,7 @@ class EvalOutputAPI::EvalOutput {
  // The following interfaces are dependant on the actual evaluator type (CPU, OpenGL, etc.) which
  // have slightly different APIs to access patch arrays, as well as different types for their
  // data structure. They need to be overridden in the specific instances of the EvalOutput derived
-  // classes if needed, while the interfaces above are overriden through VolatileEvalOutput.
+  // classes if needed, while the interfaces above are overridden through VolatileEvalOutput.

  virtual void fillPatchArraysBuffer(OpenSubdiv_Buffer * /*patch_arrays_buffer*/)
  {
--- a/release/lts/README.md
+++ b/release/lts/README.md
@@ -1,41 +1,18 @@
-This folder contains several scripts to smoothen the Blender LTS releases.
+This folder contains a script to generate release notes and download URLs
+for Blender LTS releases.

-create_download_urls.py
-=======================
+Ensure required Python modules are installed before running:

-This python script is used to generate the download urls which we can
-copy-paste directly into the CMS of www.blender.org.
+    pip3 install -r ./requirements.txt

-Usage: create_download_urls.py --version 2.83.7
+Then run for example:

-Arguments:
-  --version VERSION  Version string in the form of {major}.{minor}.{build}
-                     (eg 2.83.7)
+    ./create_release_notes.py --version 3.3.2 --format=html

-The resulting html will be printed to the console.
+Available arguments:

-create_release_notes.py
-=======================
-
-This python script is used to generate the release notes which we can
-copy-paste directly into the CMS of www.blender.org and stores.
-
-Usage: ./create_release_notes.py --task=T77348 --version=2.83.7
-
-Arguments:
-  --version VERSION  Version string in the form of {major}.{minor}.{build}
-                     (e.g. 2.83.7)
-  --task TASK        Phabricator ticket that is contains the release notes
-                     information (e.g. T77348)
-  --format FORMAT    Format the result in `text`, `steam`, `wiki` or `html`
-
-Requirements
-============
-
-* Python 3.8 or later
-* Python phabricator client version 0.7.0
-  https://pypi.org/project/phabricator/
-
-For convenience the python modules can be installed using pip
-
-    pip3 install -r ./requirements.txt
+    --version VERSION  Version string in the form of {major}.{minor}.{build}
+                       (e.g. 3.3.2)
+    --issue ISSUE      Gitea issue that is contains the release notes
+                       information (e.g. #77348)
+    --format FORMAT    Format the result in `text`, `steam`, `wiki` or `html`
--- a/release/lts/create_release_notes.py
+++ b/release/lts/create_release_notes.py
@@ -1,169 +1,46 @@
+#!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0-or-later

-#!/usr/bin/env python3
-
 import argparse
-import phabricator

+import lts_issue
+import lts_download

-DESCRIPTION = ("This python script is used to generate the release notes "
-               "which we can copy-paste directly into the CMS of "
+DESCRIPTION = ("This python script is used to generate the release notes and "
+               "download URLs which we can copy-paste directly into the CMS of "
               "www.blender.org and stores.")
-USAGE = "./create_release_notes.py --task=T77348 --version=2.83.7"

+# Parse arguments
+parser = argparse.ArgumentParser(description=DESCRIPTION)
+parser.add_argument(
+    "--version",
+    required=True,
+    help="Version string in the form of {major}.{minor}.{patch} (e.g. 3.3.2)")
+parser.add_argument(
+    "--issue",
+    help="Task that is contains the release notes information (e.g. #77348)")
+parser.add_argument(
+    "--format",
+    help="Format the result in `text`, `steam`, `wiki` or `html`",
+    default="text")
+args = parser.parse_args()

-class ReleaseLogLine:
-    """
-    Class containing the information of a single line of the release log
+# Determine issue number
+version = args.version
+issue = args.issue
+if not issue:
+    if version.startswith("2.83."):
+        issue = "#77348"
+    elif version.startswith("2.93."):
+        issue = "#88449"
+    elif version.startswith("3.3."):
+        issue = "#100749"
+    else:
+        raise ValueError("Specify --issue or update script to include issue number for this version")

-    Instance attributes:
+# Print
+if args.format == "html":
+    lts_download.print_urls(version=version)
+    print("")

-    * line: (str) the original line used to create this log line
-    * task_id: (int or None) the extracted task id associated with this log
-               line. Can be None if the log line isn't associated with a task.
-    * commit_id: (str or None) the extracted commit id associated with this log
-               line. Only filled when no `task_id` could be found.
-    * ref: (str) `task_id` or `commit_id` of this line, including `T` for tasks
-            or `D` for diffs.
-    * title: (str) title of this log line. When constructed this attribute is
-            an empty string. The called needs to retrieve the title from the
-            backend.
-    * url: (str) url of the ticket task or commit.
-    """
-
-    def __init__(self, line: str):
-        self.line = line
-        items = line.split("|")
-        self.task_id = None
-        self.commit_id = None
-        try:
-            task_id = int(items[1].strip()[1:])
-            self.task_id = task_id
-            self.ref = f"T{self.task_id}"
-        except ValueError:
-            # no task
-            commit_string = items[3].strip()
-            commits = commit_string.split(",")
-            commit_id = commits[0]
-            commit_id = commit_id.replace("{", "").replace("}", "")
-            if not commit_id.startswith("rB"):
-                commit_id = f"rB{commit_id}"
-            self.commit_id = commit_id
-
-            self.ref = f"{self.commit_id}"
-
-        self.title = ""
-        self.url = f"https://developer.blender.org/{self.ref}"
-
-    def __format_as_html(self) -> str:
-        return f"  <li>{self.title} [<a href=\"{self.url}\">{self.ref}</a>]</li>"
-
-    def __format_as_text(self) -> str:
-        return f"* {self.title} [{self.ref}]"
-
-    def __format_as_steam(self) -> str:
-        return f"* {self.title} ([url={self.url}]{self.ref}[/url])"
-
-    def __format_as_wiki(self) -> str:
-        if self.task_id:
-            return f"* {self.title} [{{{{BugReport|{self.task_id}}}}}]"
-        else:
-            return f"* {self.title} [{{{{GitCommit|{self.commit_id[2:]}}}}}]"
-
-    def format(self, format: str) -> str:
-        """
-        Format this line
-
-        :attr format: the desired format. Possible values are 'text', 'steam' or 'html'
-        :type string:
-        """
-        if format == 'html':
-            return self.__format_as_html()
-        elif format == 'steam':
-            return self.__format_as_steam()
-        elif format == 'wiki':
-            return self.__format_as_wiki()
-        else:
-            return self.__format_as_text()
-
-
-def format_title(title: str) -> str:
-    title = title.strip()
-    if not title.endswith("."):
-        title = title + "."
-    return title
-
-
-def extract_release_notes(version: str, task_id: int):
-    """
-    Extract all release notes logs
-
-    # Process
-
-    1. Retrieval of description of the given `task_id`.
-    2. Find rows for the given `version` and convert to `ReleaseLogLine`.
-    3. based on the associated task or commit retrieves the title of the log
-       line.
-    """
-    phab = phabricator.Phabricator()
-    phab.update_interfaces()
-    task = phab.maniphest.info(task_id=task_id)
-    description = task["description"]
-    lines = description.split("\n")
-    start_index = lines.index(f"## Blender {version} ##")
-    lines = lines[start_index + 1:]
-    for line in lines:
-        if not line.strip():
-            continue
-        if line.startswith("| **Report**"):
-            continue
-        if line.startswith("## Blender"):
-            break
-
-        log_line = ReleaseLogLine(line)
-        if log_line.task_id:
-            issue_task = phab.maniphest.info(task_id=log_line.task_id)
-            log_line.title = format_title(issue_task.title)
-            yield log_line
-        elif log_line.commit_id:
-            commits = phab.diffusion.commit.search(constraints={"identifiers": [log_line.commit_id]})
-            commit = commits.data[0]
-            commit_message = commit['fields']['message']
-            commit_title = commit_message.split("\n")[0]
-            log_line.title = format_title(commit_title)
-            yield log_line
-
-
-def print_release_notes(version: str, format: str, task_id: int):
-    """
-        Generate and print the release notes to the console.
-    """
-    if format == 'html':
-        print("<ul>")
-    if format == 'steam':
-        print("[ul]")
-    for log_item in extract_release_notes(version=version, task_id=task_id):
-        print(log_item.format(format=format))
-    if format == 'html':
-        print("</ul>")
-    if format == 'steam':
-        print("[/ul]")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
-    parser.add_argument(
-        "--version",
-        required=True,
-        help="Version string in the form of {major}.{minor}.{build} (e.g. 2.83.7)")
-    parser.add_argument(
-        "--task",
-        required=True,
-        help="Phabricator ticket that is contains the release notes information (e.g. T77348)")
-    parser.add_argument(
-        "--format",
-        help="Format the result in `text`, `steam`, `wiki` or `html`",
-        default="text")
-    args = parser.parse_args()
-
-    print_release_notes(version=args.version, format=args.format, task_id=int(args.task[1:]))
+lts_issue.print_notes(version=version, format=args.format, issue=issue)
--- a/release/lts/create_download_urls.py
+++ b/release/lts/create_download_urls.py
@@ -1,14 +1,9 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: GPL-2.0-or-later

-import argparse
 import datetime


-DESCRIPTION = ("This python script is used to generate the download urls "
-               "which we can copy-paste directly into the CMS of "
-               "www.blender.org")
-USAGE = "create_download_urls --version=2.83.7"
 # Used date format: "September 30, 2020"
 DATE_FORMAT = "%B %d, %Y"

@@ -62,19 +57,8 @@ def generate_html(version: Version) -> str:
    return "\n".join(lines)


-def print_download_urls(version: Version):
+def print_urls(version: str):
    """
    Generate the download urls and print them to the console.
    """
-    print(generate_html(version))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
-    parser.add_argument("--version",
-                        required=True,
-                        help=("Version string in the form of {major}.{minor}."
-                              "{build} (eg 2.83.7)"))
-    args = parser.parse_args()
-
-    print_download_urls(version=Version(args.version))
+    print(generate_html(Version(version)))
--- a/release/lts/lts_issue.py
+++ b/release/lts/lts_issue.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+import requests
+
+
+class ReleaseLogLine:
+    """
+    Class containing the information of a single line of the release log
+
+    Instance attributes:
+
+    * line: (str) the original line used to create this log line
+    * issue_id: (int or None) the extracted issue id associated with this log
+               line. Can be None if the log line isn't associated with a issue.
+    * commit_id: (str or None) the extracted commit id associated with this log
+               line. Only filled when no `issue_id` could be found.
+    * ref: (str) `issue_id` or `commit_id` of this line, including `T` for issues
+            or `D` for diffs.
+    * title: (str) title of this log line. When constructed this attribute is
+            an empty string. The called needs to retrieve the title from the
+            backend.
+    * url: (str) url of the ticket issue or commit.
+    """
+
+    def __init__(self, line: str):
+        self.line = line
+        items = line.split("|")
+        self.issue_id = None
+        self.issue_repo = None
+        self.commit_id = None
+        self.commit_repo = None
+        base_url = "https://projects.blender.org"
+        try:
+            issue_tokens = items[1].strip().split("#")
+            if len(issue_tokens[0]) > 0:
+                self.issue_repo = issue_tokens[0]
+                self.issue_id = issue_tokens[1]
+            else:
+                self.issue_repo = "blender/blender"
+                self.issue_id = issue_tokens[1]
+
+            self.ref = f"#{self.issue_id}"
+            self.url = f"{base_url}/{self.issue_repo}/issues/{self.issue_id}"
+        except IndexError:
+            # no issue
+            commit_string = items[3].strip()
+            commit_string = commit_string.split(",")[0]
+            commit_string = commit_string.split("]")[0]
+            commit_string = commit_string.replace("[", "")
+
+            commit_tokens = commit_string.split("@")
+            if len(commit_tokens) > 1:
+                self.commit_repo = commit_tokens[0]
+                self.commit_id = commit_tokens[1]
+            else:
+                self.commit_repo = "blender/blender"
+                self.commit_id = commit_tokens[0]
+
+            self.ref = f"{self.commit_id}"
+            self.url = f"{base_url}/{self.commit_repo}/commit/{self.commit_id}"
+
+        self.title = ""
+
+    def __format_as_html(self) -> str:
+        return f"  <li>{self.title} [<a href=\"{self.url}\">{self.ref}</a>]</li>"
+
+    def __format_as_text(self) -> str:
+        return f"* {self.title} [{self.ref}]"
+
+    def __format_as_steam(self) -> str:
+        return f"* {self.title} ([url={self.url}]{self.ref}[/url])"
+
+    def __format_as_wiki(self) -> str:
+        if self.issue_id:
+            return f"* {self.title} [{{{{BugReport|{self.issue_id}}}}}]"
+        else:
+            return f"* {self.title} [{{{{GitCommit|{self.commit_id[2:]}}}}}]"
+
+    def format(self, format: str) -> str:
+        """
+        Format this line
+
+        :attr format: the desired format. Possible values are 'text', 'steam' or 'html'
+        :type string:
+        """
+        if format == 'html':
+            return self.__format_as_html()
+        elif format == 'steam':
+            return self.__format_as_steam()
+        elif format == 'wiki':
+            return self.__format_as_wiki()
+        else:
+            return self.__format_as_text()
+
+
+def format_title(title: str) -> str:
+    title = title.strip()
+    if not title.endswith("."):
+        title = title + "."
+    return title
+
+
+def extract_release_notes(version: str, issue: str):
+    """
+    Extract all release notes logs
+
+    # Process
+
+    1. Retrieval of description of the given `issue_id`.
+    2. Find rows for the given `version` and convert to `ReleaseLogLine`.
+    3. based on the associated issue or commit retrieves the title of the log
+       line.
+    """
+    base_url = "https://projects.blender.org/api/v1/repos"
+    issues_url = base_url + "/blender/blender/issues/"
+    headers = {'accept': 'application/json'}
+
+    response = requests.get(issues_url + issue[1:], headers=headers)
+    description = response.json()["body"]
+
+    lines = description.split("\n")
+    start_index = lines.index(f"## Blender {version}")
+    lines = lines[start_index + 1:]
+    for line in lines:
+        if not line.strip():
+            continue
+        if line.startswith("| **Report**"):
+            continue
+        if line.startswith("## Blender"):
+            break
+        if line.find("| -- |") != -1:
+            continue
+
+        log_line = ReleaseLogLine(line)
+        if log_line.issue_id:
+            issue_url = f"{base_url}/{log_line.issue_repo}/issues/{log_line.issue_id}"
+            response = requests.get(issue_url, headers=headers)
+            if response.status_code != 200:
+                raise ValueError("Issue not found: " + str(log_line.issue_id))
+
+            log_line.title = format_title(response.json()["title"])
+            yield log_line
+        elif log_line.commit_id:
+            commit_url = f"{base_url}/{log_line.commit_repo}/git/commits/{log_line.commit_id}"
+            response = requests.get(commit_url, headers=headers)
+            if response.status_code != 200:
+                raise ValueError("Commit not found: " + log_line.commit_id)
+
+            commit_message = response.json()['commit']['message']
+            commit_title = commit_message.split("\n")[0]
+            log_line.title = format_title(commit_title)
+            yield log_line
+
+
+def print_notes(version: str, format: str, issue: str):
+    """
+        Generate and print the release notes to the console.
+    """
+    if format == 'html':
+        print("<ul>")
+    if format == 'steam':
+        print("[ul]")
+    for log_item in extract_release_notes(version=version, issue=issue):
+        print(log_item.format(format=format))
+    if format == 'html':
+        print("</ul>")
+    if format == 'steam':
+        print("[/ul]")
--- a/release/lts/requirements.txt
+++ b/release/lts/requirements.txt
@@ -1 +1 @@
-phabricator==0.7.0
+requests
--- a/release/scripts/addons
+++ b/release/scripts/addons
--- a/release/scripts/modules/bpy/utils/init.py
+++ b/release/scripts/modules/bpy/utils/init.py
@@ -1002,11 +1002,11 @@ def unregister_tool(tool_cls):

 # we start with the built-in default mapping
 def _blender_default_map():
-    import rna_manual_reference as ref_mod
-    ret = (ref_mod.url_manual_prefix, ref_mod.url_manual_mapping)
-    # avoid storing in memory
-    del _sys.modules["rna_manual_reference"]
-    return ret
+    # NOTE(@ideasman42): Avoid importing this as there is no need to keep the lookup table in memory.
+    # As this runs when the user accesses the "Online Manual", the overhead loading the file is acceptable.
+    # In my tests it's under 1/100th of a second loading from a `pyc`.
+    ref_mod = execfile(_os.path.join(_script_base_dir, "modules", "rna_manual_reference.py"))
+    return (ref_mod.url_manual_prefix, ref_mod.url_manual_mapping)


 # hooks for doc lookups
--- a/release/scripts/presets/keyconfig/keymap_data/blender_default.py
+++ b/release/scripts/presets/keyconfig/keymap_data/blender_default.py
@@ -1299,6 +1299,8 @@ def km_uv_editor(params):
         {"properties": [("data_path", 'tool_settings.snap_uv_element')]}),
        ("wm.context_toggle", {"type": 'ACCENT_GRAVE', "value": 'PRESS', "ctrl": True},
         {"properties": [("data_path", 'space_data.show_gizmo')]}),
+        ("wm.context_toggle", {"type": 'Z', "value": 'PRESS', "alt": True, "shift": True},
+         {"properties": [("data_path", "space_data.overlay.show_overlays")]}),
        *_template_items_context_menu("IMAGE_MT_uvs_context_menu", params.context_menu_event),
    ])

@@ -1968,6 +1970,8 @@ def km_image(params):
        ("image.clear_render_border", {"type": 'B', "value": 'PRESS', "ctrl": True, "alt": True}, None),
        ("wm.context_toggle", {"type": 'ACCENT_GRAVE', "value": 'PRESS', "ctrl": True},
         {"properties": [("data_path", 'space_data.show_gizmo')]}),
+        ("wm.context_toggle", {"type": 'Z', "value": 'PRESS', "alt": True, "shift": True},
+         {"properties": [("data_path", "space_data.overlay.show_overlays")]}),
        *_template_items_context_menu("IMAGE_MT_mask_context_menu", params.context_menu_event),
    ])

@@ -2914,6 +2918,8 @@ def km_sequencer(params):
         {"properties": [("side", 'RIGHT')]}),
        ("wm.context_toggle", {"type": 'TAB', "value": 'PRESS', "shift": True},
         {"properties": [("data_path", 'tool_settings.use_snap_sequencer')]}),
+        ("wm.context_toggle", {"type": 'Z', "value": 'PRESS', "alt": True, "shift": True},
+         {"properties": [("data_path", "space_data.show_overlays")]}),
        *_template_items_context_menu("SEQUENCER_MT_context_menu", params.context_menu_event),
    ])

@@ -6338,6 +6344,8 @@ def km_node_link_modal_map(_params):
    return keymap

 # Fallback for gizmos that don't have custom a custom key-map.
+
+
 def km_generic_gizmo(_params):
    keymap = (
        "Generic Gizmo",
--- a/release/scripts/startup/bl_ui/node_add_menu_geometry.py
+++ b/release/scripts/startup/bl_ui/node_add_menu_geometry.py
@@ -338,6 +338,7 @@ class NODE_MT_geometry_node_GEO_MESH_READ(Menu):
        node_add_menu.add_node_type(layout, "GeometryNodeInputMeshEdgeAngle")
        node_add_menu.add_node_type(layout, "GeometryNodeInputMeshEdgeNeighbors")
        node_add_menu.add_node_type(layout, "GeometryNodeInputMeshEdgeVertices")
+        node_add_menu.add_node_type(layout, "GeometryNodeEdgesToFaceGroups")
        node_add_menu.add_node_type(layout, "GeometryNodeInputMeshFaceArea")
        node_add_menu.add_node_type(layout, "GeometryNodeInputMeshFaceNeighbors")
        node_add_menu.add_node_type(layout, "GeometryNodeMeshFaceSetBoundaries")
--- a/release/scripts/startup/bl_ui/properties_render.py
+++ b/release/scripts/startup/bl_ui/properties_render.py
@@ -469,6 +469,11 @@ class RENDER_PT_eevee_next_shadows(RenderButtonsPanel, Panel):
    def poll(cls, context):
        return (context.engine in cls.COMPAT_ENGINES)

+    def draw_header(self, context):
+        scene = context.scene
+        props = scene.eevee
+        self.layout.prop(props, "use_shadows", text="")
+
    def draw(self, context):
        layout = self.layout
        layout.use_property_split = True
--- a/release/scripts/startup/bl_ui/space_sequencer.py
+++ b/release/scripts/startup/bl_ui/space_sequencer.py
@@ -1996,7 +1996,7 @@ class SEQUENCER_PT_adjust_sound(SequencerButtonsPanel, Panel):

            split = col.split(factor=0.4)
            split.alignment = 'RIGHT'
-            split.label(text="Pan", heading_ctxt=i18n_contexts.id_sound)
+            split.label(text="Pan", text_ctxt=i18n_contexts.id_sound)
            split.prop(strip, "pan", text="")
            split.enabled = pan_enabled

--- a/release/scripts/startup/bl_ui/space_toolsystem_toolbar.py
+++ b/release/scripts/startup/bl_ui/space_toolsystem_toolbar.py
--- a/release/scripts/startup/bl_ui/space_userpref.py
+++ b/release/scripts/startup/bl_ui/space_userpref.py
@@ -2267,7 +2267,7 @@ class ExperimentalPanel:
    bl_region_type = 'WINDOW'
    bl_context = "experimental"

-    url_prefix = "https://developer.blender.org/"
+    url_prefix = "https://projects.blender.org/"

    @classmethod
    def poll(cls, _context):
@@ -2308,8 +2308,8 @@ class USERPREF_PT_experimental_virtual_reality(ExperimentalPanel, Panel):
    def draw(self, context):
        self._draw_items(
            context, (
-                ({"property": "use_virtual_reality_scene_inspection"}, "T71347"),
-                ({"property": "use_virtual_reality_immersive_drawing"}, "T71348"),
+                ({"property": "use_virtual_reality_scene_inspection"}, ("blender/blender/issues/71347", "#71347")),
+                ({"property": "use_virtual_reality_immersive_drawing"}, ("blender/blender/issues/71348", "#71348")),
            ),
        )
 """
@@ -2319,13 +2319,18 @@ class USERPREF_PT_experimental_new_features(ExperimentalPanel, Panel):
    bl_label = "New Features"

    def draw(self, context):
-        self._draw_items(
-            context, (
-                ({"property": "use_sculpt_tools_tilt"}, "T82877"),
-                ({"property": "use_extended_asset_browser"}, ("project/view/130/", "Project Page")),
-                ({"property": "use_override_templates"}, ("T73318", "Milestone 4")),
-            ),
-        )
+        self._draw_items(context,
+                         (({"property": "use_sculpt_tools_tilt"},
+                           ("blender/blender/issues/82877",
+                            "#82877")),
+                             ({"property": "use_extended_asset_browser"},
+                              ("blender/blender/projects/10",
+                                 "Pipeline, Assets & IO Project Page")),
+                             ({"property": "use_override_templates"},
+                              ("blender/blender/issues/73318",
+                                 "Milestone 4")),
+                          ),
+                         )


 class USERPREF_PT_experimental_prototypes(ExperimentalPanel, Panel):
@@ -2334,12 +2339,12 @@ class USERPREF_PT_experimental_prototypes(ExperimentalPanel, Panel):
    def draw(self, context):
        self._draw_items(
            context, (
-                ({"property": "use_new_curves_tools"}, "T68981"),
-                ({"property": "use_new_point_cloud_type"}, "T75717"),
-                ({"property": "use_sculpt_texture_paint"}, "T96225"),
-                ({"property": "use_full_frame_compositor"}, "T88150"),
-                ({"property": "enable_eevee_next"}, "T93220"),
-                ({"property": "enable_workbench_next"}, "T101619"),
+                ({"property": "use_new_curves_tools"}, ("blender/blender/issues/68981", "#68981")),
+                ({"property": "use_new_point_cloud_type"}, ("blender/blender/issues/75717", "#75717")),
+                ({"property": "use_sculpt_texture_paint"}, ("blender/blender/issues/96225", "#96225")),
+                ({"property": "use_full_frame_compositor"}, ("blender/blender/issues/88150", "#88150")),
+                ({"property": "enable_eevee_next"}, ("blender/blender/issues/93220", "#93220")),
+                ({"property": "enable_workbench_next"}, ("blender/blender/issues/101619", "#101619")),
            ),
        )

@@ -2352,7 +2357,7 @@ class USERPREF_PT_experimental_tweaks(ExperimentalPanel, Panel):
    def draw(self, context):
        self._draw_items(
            context, (
-                ({"property": "use_select_nearest_on_first_click"}, "T96752"),
+                ({"property": "use_select_nearest_on_first_click"}, ("blender/blender/issues/96752", "#96752")),
            ),
        )

@@ -2371,8 +2376,8 @@ class USERPREF_PT_experimental_debugging(ExperimentalPanel, Panel):
    def draw(self, context):
        self._draw_items(
            context, (
-                ({"property": "use_undo_legacy"}, "T60695"),
-                ({"property": "override_auto_resync"}, "T83811"),
+                ({"property": "use_undo_legacy"}, ("blender/blender/issues/60695", "#60695")),
+                ({"property": "override_auto_resync"}, ("blender/blender/issues/83811", "#83811")),
                ({"property": "use_cycles_debug"}, None),
                ({"property": "show_asset_debug_info"}, None),
                ({"property": "use_asset_indexing"}, None),
--- a/release/text/readme.html
+++ b/release/text/readme.html
@@ -96,8 +96,8 @@ Chat <a href="https://blender.chat/channel/today">
 <p class="p5">
 <span class="s3">Development <a href="https://www.blender.org/get-involved/developers/">
 <span class="s4">www.blender.org/get-involved/developers/</span></a><br>
-GIT and Bug Tracker <a href="https://developer.blender.org/">
-<span class="s4">developer.blender.org</span></a><br>
+GIT and Bug Tracker <a href="https://projects.blender.org/">
+<span class="s4">projects.blender.org</span></a><br>
 Chat <a href="https://blender.chat/channel/blender-coders">
 <span class="s4">#blender-coders</span></a> on blender.chat</span>
 </p>
--- a/release/windows/batch/blender_debug_gpu.cmd
+++ b/release/windows/batch/blender_debug_gpu.cmd
@@ -3,7 +3,7 @@ echo Starting blender with GPU debugging options, log files will be created
 echo in your temp folder, windows explorer will open after you close blender
 echo to help you find them.
 echo.
-echo If you report a bug on https://developer.blender.org you can attach these files
+echo If you report a bug on https://projects.blender.org you can attach these files
 echo by dragging them into the text area of your bug report, please include both
 echo blender_debug_output.txt and blender_system_info.txt in your report. 
 echo.
--- a/release/windows/batch/blender_debug_gpu_glitchworkaround.cmd
+++ b/release/windows/batch/blender_debug_gpu_glitchworkaround.cmd
@@ -3,7 +3,7 @@ echo Starting blender with GPU debugging and glitch workaround options, log file
 echo will be created in your temp folder, windows explorer will open after you 
 echo close blender to help you find them.
 echo.
-echo If you report a bug on https://developer.blender.org you can attach these files
+echo If you report a bug on https://projects.blender.org you can attach these files
 echo by dragging them into the text area of your bug report, please include both
 echo blender_debug_output.txt and blender_system_info.txt in your report. 
 echo.
--- a/release/windows/batch/blender_debug_log.cmd
+++ b/release/windows/batch/blender_debug_log.cmd
@@ -3,7 +3,7 @@ echo Starting blender with debug logging options, log files will be created
 echo in your temp folder, windows explorer will open after you close blender
 echo to help you find them.
 echo.
-echo If you report a bug on https://developer.blender.org you can attach these files
+echo If you report a bug on https://projects.blender.org you can attach these files
 echo by dragging them into the text area of your bug report, please include both
 echo blender_debug_output.txt and blender_system_info.txt in your report. 
 echo.
--- a/release/windows/batch/blender_factory_startup.cmd
+++ b/release/windows/batch/blender_factory_startup.cmd
@@ -3,7 +3,7 @@ echo Starting blender with factory settings, log files will be created
 echo in your temp folder, windows explorer will open after you close blender
 echo to help you find them.
 echo.
-echo If you report a bug on https://developer.blender.org you can attach these files
+echo If you report a bug on https://projects.blender.org you can attach these files
 echo by dragging them into the text area of your bug report, please include both
 echo blender_debug_output.txt and blender_system_info.txt in your report. 
 echo.
--- a/source/blender/blenfont/intern/blf_internal_types.h
+++ b/source/blender/blenfont/intern/blf_internal_types.h
@@ -40,7 +40,7 @@ typedef int32_t ft_pix;
 /* Macros copied from `include/freetype/internal/ftobjs.h`. */

 /**
- * FIXME(@campbellbarton): Follow rounding from Blender 3.1x and older.
+ * FIXME(@ideasman42): Follow rounding from Blender 3.1x and older.
 * This is what users will expect and changing this creates wider spaced text.
 * Use this macro to communicate that rounding should be used, using floor is to avoid
 * user visible changes, which can be reviewed and handled separately.
--- a/source/blender/blenkernel/BKE_crazyspace.hh
+++ b/source/blender/blenkernel/BKE_crazyspace.hh
@@ -6,8 +6,7 @@

 #pragma once

-#include "BLI_float3x3.hh"
-#include "BLI_math_vector_types.hh"
+#include "BLI_math_matrix.hh"
 #include "BLI_span.hh"

 struct Depsgraph;
@@ -38,7 +37,7 @@ struct GeometryDeformation {
      return translation;
    }
    const float3x3 &deform_mat = this->deform_mats[position_i];
-    return deform_mat.inverted() * translation;
+    return math::transform_point(math::invert(deform_mat), translation);
  }
 };

--- a/source/blender/blenkernel/BKE_curves.hh
+++ b/source/blender/blenkernel/BKE_curves.hh
@@ -2,31 +2,25 @@

 #pragma once

-#include "BKE_curves.h"
-
 /** \file
 * \ingroup bke
 * \brief Low-level operations for curves.
 */

-#include <mutex>
-
 #include "BLI_bounds_types.hh"
-#include "BLI_cache_mutex.hh"
-#include "BLI_float3x3.hh"
-#include "BLI_float4x4.hh"
 #include "BLI_generic_virtual_array.hh"
 #include "BLI_index_mask.hh"
+#include "BLI_math_matrix_types.hh"
 #include "BLI_math_vector_types.hh"
 #include "BLI_offset_indices.hh"
 #include "BLI_shared_cache.hh"
 #include "BLI_span.hh"
-#include "BLI_task.hh"
 #include "BLI_vector.hh"
 #include "BLI_virtual_array.hh"

 #include "BKE_attribute.hh"
 #include "BKE_attribute_math.hh"
+#include "BKE_curves.h"

 namespace blender::bke {

--- a/source/blender/blenkernel/BKE_geometry_set_instances.hh
+++ b/source/blender/blenkernel/BKE_geometry_set_instances.hh
@@ -2,7 +2,7 @@

 #pragma once

-#include "BLI_float4x4.hh"
+#include "BLI_math_matrix_types.hh"

 #include "BKE_geometry_set.hh"

--- a/source/blender/blenkernel/BKE_instances.hh
+++ b/source/blender/blenkernel/BKE_instances.hh
@@ -19,7 +19,7 @@

 #include <mutex>

-#include "BLI_float4x4.hh"
+#include "BLI_math_matrix_types.hh"
 #include "BLI_vector.hh"
 #include "BLI_vector_set.hh"

--- a/source/blender/blenkernel/BKE_lib_id.h
+++ b/source/blender/blenkernel/BKE_lib_id.h
@@ -497,6 +497,7 @@ void BKE_lib_id_expand_local(struct Main *bmain, struct ID *id, int flags);
 *
 * Only for local IDs (linked ones already have a unique ID in their library).
 *
+ * \param name: The new name of the given ID, if NULL the current given ID name is used instead.
 * \param do_linked_data: if true, also ensure a unique name in case the given \a id is linked
 * (otherwise, just ensure that it is properly sorted).
 *
--- a/source/blender/blenkernel/BKE_lib_query.h
+++ b/source/blender/blenkernel/BKE_lib_query.h
@@ -142,6 +142,9 @@ enum {
  /** Also process internal ID pointers like `ID.newid` or `ID.orig_id`.
   *  WARNING: Dangerous, use with caution. */
  IDWALK_DO_INTERNAL_RUNTIME_POINTERS = (1 << 9),
+  /** Also process the ID.lib pointer. It is an option because this pointer can usually be fully
+     ignored. */
+  IDWALK_DO_LIBRARY_POINTER = (1 << 10),
 };

 typedef struct LibraryForeachIDData LibraryForeachIDData;
--- a/source/blender/blenkernel/BKE_lib_remap.h
+++ b/source/blender/blenkernel/BKE_lib_remap.h
@@ -222,8 +222,12 @@ void BKE_id_remapper_clear(struct IDRemapper *id_remapper);
 bool BKE_id_remapper_is_empty(const struct IDRemapper *id_remapper);
 /** Free the given ID Remapper. */
 void BKE_id_remapper_free(struct IDRemapper *id_remapper);
-/** Add a new remapping. */
+/** Add a new remapping. Does not replace an existing mapping for `old_id`, if any. */
 void BKE_id_remapper_add(struct IDRemapper *id_remapper, struct ID *old_id, struct ID *new_id);
+/** Add a new remapping, replacing a potential already existing mapping of `old_id`. */
+void BKE_id_remapper_add_overwrite(struct IDRemapper *id_remapper,
+                                   struct ID *old_id,
+                                   struct ID *new_id);

 /**
 * Apply a remapping.
--- a/source/blender/blenkernel/BKE_main.h
+++ b/source/blender/blenkernel/BKE_main.h
@@ -425,7 +425,7 @@ int set_listbasepointers(struct Main *main, struct ListBase *lb[]);
 /**
 * The size of thumbnails (optionally) stored in the `.blend` files header.
 *
- * NOTE(@campbellbarton): This is kept small as it's stored uncompressed in the `.blend` file,
+ * NOTE(@ideasman42): This is kept small as it's stored uncompressed in the `.blend` file,
 * where a larger size would increase the size of every `.blend` file unreasonably.
 * If we wanted to increase the size, we'd want to use compression (JPEG or similar).
 */
--- a/source/blender/blenkernel/BKE_main_namemap.h
+++ b/source/blender/blenkernel/BKE_main_namemap.h
@@ -29,6 +29,15 @@ struct UniqueName_Map;
 struct UniqueName_Map *BKE_main_namemap_create(void) ATTR_WARN_UNUSED_RESULT;
 void BKE_main_namemap_destroy(struct UniqueName_Map **r_name_map) ATTR_NONNULL();

+/**
+ * Destroy all name_maps in given bmain:
+ * - In bmain itself for local IDs.
+ * - In the split bmains in the list is any (for linked IDs in some cases, e.g. if called during
+ *   readfile code).
+ * - In all of the libraries IDs (for linked IDs).
+ */
+void BKE_main_namemap_clear(struct Main *bmain) ATTR_NONNULL();
+
 /**
 * Ensures the given name is unique within the given ID type.
 *
--- a/Show More
+++ b/Show More