Geometry Node: Multi-input socket tooltip #104468

Merged
Jacques Lucke merged 33 commits from mod_moder/blender:multi_input_tooltip into main 2024-04-22 19:49:08 +02:00
420 changed files with 7169 additions and 6092 deletions
Showing only changes of commit 50dca6bee8


@ -1,5 +1,4 @@
This repository is only used as a mirror of git.blender.org. Blender development happens on
https://developer.blender.org.
This repository is only used as a mirror. Blender development happens on projects.blender.org.
To get started with contributing code, please see:
https://wiki.blender.org/wiki/Process/Contributing_Code

.github/stale.yml vendored

@ -15,8 +15,7 @@ staleLabel: stale
# Comment to post when closing a stale Issue or Pull Request.
closeComment: >
This issue has been automatically closed, because this repository is only
used as a mirror of git.blender.org. Blender development happens on
developer.blender.org.
used as a mirror. Blender development happens on projects.blender.org.
To get started contributing code, please read:
https://wiki.blender.org/wiki/Process/Contributing_Code

.gitmodules vendored

@ -1,20 +1,16 @@
[submodule "release/scripts/addons"]
path = release/scripts/addons
url = ../blender-addons.git
branch = master
ignore = all
branch = main
[submodule "release/scripts/addons_contrib"]
path = release/scripts/addons_contrib
url = ../blender-addons-contrib.git
branch = master
ignore = all
branch = main
[submodule "release/datafiles/locale"]
path = release/datafiles/locale
url = ../blender-translations.git
branch = master
ignore = all
branch = main
[submodule "source/tools"]
path = source/tools
url = ../blender-dev-tools.git
branch = master
ignore = all
branch = main


@ -24,7 +24,7 @@ Development
-----------
- [Build Instructions](https://wiki.blender.org/wiki/Building_Blender)
- [Code Review & Bug Tracker](https://developer.blender.org)
- [Code Review & Bug Tracker](https://projects.blender.org)
- [Developer Forum](https://devtalk.blender.org)
- [Developer Documentation](https://wiki.blender.org)


@ -23,19 +23,19 @@ if(EXISTS ${SOURCE_DIR}/.git)
if(MY_WC_BRANCH STREQUAL "HEAD")
# Detached HEAD, check whether commit hash is reachable
# in the master branch
# in the main branch
execute_process(COMMAND git rev-parse --short=12 HEAD
WORKING_DIRECTORY ${SOURCE_DIR}
OUTPUT_VARIABLE MY_WC_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND git branch --list master blender-v* --contains ${MY_WC_HASH}
execute_process(COMMAND git branch --list main blender-v* --contains ${MY_WC_HASH}
WORKING_DIRECTORY ${SOURCE_DIR}
OUTPUT_VARIABLE _git_contains_check
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _git_contains_check STREQUAL "")
set(MY_WC_BRANCH "master")
set(MY_WC_BRANCH "main")
else()
execute_process(COMMAND git show-ref --tags -d
WORKING_DIRECTORY ${SOURCE_DIR}
@ -48,7 +48,7 @@ if(EXISTS ${SOURCE_DIR}/.git)
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(_git_tag_hashes MATCHES "${_git_head_hash}")
set(MY_WC_BRANCH "master")
set(MY_WC_BRANCH "main")
else()
execute_process(COMMAND git branch --contains ${MY_WC_HASH}
WORKING_DIRECTORY ${SOURCE_DIR}


@ -11,11 +11,11 @@
mkdir ~/blender-git
cd ~/blender-git
git clone http://git.blender.org/blender.git
git clone https://projects.blender.org/blender/blender.git
cd blender
git submodule update --init --recursive
git submodule foreach git checkout master
git submodule foreach git pull --rebase origin master
git submodule foreach git checkout main
git submodule foreach git pull --rebase origin main
# create build dir
mkdir ~/blender-git/build-cmake
@ -35,7 +35,7 @@ ln -s ~/blender-git/build-cmake/bin/blender ~/blender-git/blender/blender.bin
echo ""
echo "* Useful Commands *"
echo " Run Blender: ~/blender-git/blender/blender.bin"
echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin master"
echo " Update Blender: git pull --rebase; git submodule foreach git pull --rebase origin main"
echo " Reconfigure Blender: cd ~/blender-git/build-cmake ; cmake ."
echo " Build Blender: cd ~/blender-git/build-cmake ; make"
echo ""


@ -5,16 +5,16 @@
update-code:
git:
submodules:
- branch: master
- branch: main
commit_id: HEAD
path: release/scripts/addons
- branch: master
- branch: main
commit_id: HEAD
path: release/scripts/addons_contrib
- branch: master
- branch: main
commit_id: HEAD
path: release/datafiles/locale
- branch: master
- branch: main
commit_id: HEAD
path: source/tools
svn:


@ -58,7 +58,7 @@ Each Blender release supports one Python version, and the package is only compat
## Source Code
* [Releases](https://download.blender.org/source/)
* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
* Repository: [projects.blender.org/blender/blender.git](https://projects.blender.org/blender/blender)
## Credits


@ -170,7 +170,7 @@ def git_update_skip(args: argparse.Namespace, check_remote_exists: bool = True)
return "rebase or merge in progress, complete it first"
# Abort if uncommitted changes.
changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no'])
changes = check_output([args.git_command, 'status', '--porcelain', '--untracked-files=no', '--ignore-submodules'])
if len(changes) != 0:
return "you have unstaged changes"
@ -202,8 +202,8 @@ def submodules_update(
sys.exit(1)
# Update submodules to appropriate given branch,
# falling back to master if none is given and/or found in a sub-repository.
branch_fallback = "master"
# falling back to main if none is given and/or found in a sub-repository.
branch_fallback = "main"
if not branch:
branch = branch_fallback


@ -3,9 +3,9 @@ if NOT exist "%BLENDER_DIR%\source\tools\.git" (
if not "%GIT%" == "" (
"%GIT%" submodule update --init --recursive --progress
if errorlevel 1 goto FAIL
"%GIT%" submodule foreach git checkout master
"%GIT%" submodule foreach git checkout main
if errorlevel 1 goto FAIL
"%GIT%" submodule foreach git pull --rebase origin master
"%GIT%" submodule foreach git pull --rebase origin main
if errorlevel 1 goto FAIL
goto EOF
) else (


@ -37,7 +37,7 @@ def draw_callback_px(self, context):
# BLF drawing routine
font_id = font_info["font_id"]
blf.position(font_id, 2, 80, 0)
blf.size(font_id, 50, 72)
blf.size(font_id, 50)
blf.draw(font_id, "Hello World")


@ -1816,9 +1816,9 @@ def pyrna2sphinx(basepath):
# operators
def write_ops():
API_BASEURL = "https://developer.blender.org/diffusion/B/browse/master/release/scripts"
API_BASEURL_ADDON = "https://developer.blender.org/diffusion/BA"
API_BASEURL_ADDON_CONTRIB = "https://developer.blender.org/diffusion/BAC"
API_BASEURL = "https://projects.blender.org/blender/blender/src/branch/main/release/scripts"
API_BASEURL_ADDON = "https://projects.blender.org/blender/blender-addons"
API_BASEURL_ADDON_CONTRIB = "https://projects.blender.org/blender/blender-addons-contrib"
op_modules = {}
op = None


@ -156,7 +156,7 @@ var Popover = function() {
},
getNamed : function(v) {
$.each(all_versions, function(ix, title) {
if (ix === "master" || ix === "latest") {
if (ix === "master" || ix === "main" || ix === "latest") {
var m = title.match(/\d\.\d[\w\d\.]*/)[0];
if (parseFloat(m) == v) {
v = ix;


@ -1,5 +1,5 @@
Project: Blender
URL: https://git.blender.org/blender.git
URL: https://projects.blender.org/blender/blender.git
License: Apache 2.0
Upstream version: N/A
Local modifications: None


@ -1723,12 +1723,21 @@ class CyclesPreferences(bpy.types.AddonPreferences):
if compute_device_type == 'METAL':
import platform
# MetalRT only works on Apple Silicon at present, pending argument encoding fixes on AMD
# Kernel specialization is only viable on Apple Silicon at present due to relative compilation speed
if platform.machine() == 'arm64':
import re
is_navi_2 = False
for device in devices:
if re.search(r"((RX)|(Pro)|(PRO))\s+W?6\d00X", device.name):
is_navi_2 = True
break
# MetalRT only works on Apple Silicon and Navi2.
is_arm64 = platform.machine() == 'arm64'
if is_arm64 or is_navi_2:
col = layout.column()
col.use_property_split = True
col.prop(self, "kernel_optimization_level")
# Kernel specialization is only supported on Apple Silicon
if is_arm64:
col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt")
def draw(self, context):


@ -48,6 +48,8 @@ void BlenderSync::sync_light(BL::Object &b_parent,
case BL::Light::type_SPOT: {
BL::SpotLight b_spot_light(b_light);
light->set_size(b_spot_light.shadow_soft_size());
light->set_axisu(transform_get_column(&tfm, 0));
light->set_axisv(transform_get_column(&tfm, 1));
light->set_light_type(LIGHT_SPOT);
light->set_spot_angle(b_spot_light.spot_size());
light->set_spot_smooth(b_spot_light.spot_blend());
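For readers outside Cycles: the two new calls hand the spot light its local U and V axes, which are simply the first two columns of the object transform. A minimal sketch of the helper, assuming Cycles' row-major 3x4 Transform layout (the real definition lives in Cycles' transform utilities, not here):

/* Sketch only, not the actual Cycles header: a Transform stores three
 * float4 rows of a row-major 3x4 matrix, so column c of the rotation
 * part is (x[c], y[c], z[c]). */
struct float4 { float v[4]; float operator[](int i) const { return v[i]; } };
struct float3 { float x, y, z; };
struct Transform { float4 x, y, z; };

static float3 transform_get_column(const Transform *t, int c)
{
  return {t->x[c], t->y[c], t->z[c]}; /* c == 0: local X axis, c == 1: local Y axis. */
}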


@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error)
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
static_assert(sizeof(arrayMemObject) == sizeof(CUarray));
first_error = true;
cuDevId = info.num;
@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize CUDA. */
@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
cuda_assert(
cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
int value;
cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
can_map_host = value != 0;
cuda_assert(cuDeviceGetAttribute(
&pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features)
# endif
}
void CUDADevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void CUDADevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(cuda_mem_map_mutex);
foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
device_memory &mem = *pair.first;
CUDAMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple CUDA devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught by this mutex
* if it happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
CUDAContextScope scope(this);
CUdeviceptr device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
cuMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
cuMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = cuMemAlloc(&device_pointer, size);
if (mem_alloc_result == CUDA_SUCCESS) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = CUDA_SUCCESS;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
(mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
}
if (mem_alloc_result == CUDA_SUCCESS) {
cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != CUDA_SUCCESS) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(cuda_mem_map_mutex);
CUDAMem *cmem = &cuda_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* CUDA memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void CUDADevice::generic_copy_to(device_memory &mem)
bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
CUDAContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(
cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::generic_free(device_memory &mem)
void CUDADevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
CUDAContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
cuMemFreeHost(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
cuda_assert(cuMemFree(mem.device_pointer));
}
bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
CUDAContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
CUresult mem_alloc_result = cuMemHostAlloc(
&shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
return mem_alloc_result == CUDA_SUCCESS;
}
cuda_mem_map.erase(cuda_mem_map.find(&mem));
}
void CUDADevice::free_host(void *shared_pointer)
{
CUDAContextScope scope(this);
cuMemFreeHost(shared_pointer);
}
bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
CUDAContextScope scope(this);
cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
return true;
}
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const CUDAContextScope scope(this);
cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}
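Taken together, these hooks are thin wrappers over the CUDA driver API. A self-contained sketch of the same call sequence outside Blender, with error handling reduced to asserts for brevity (kSize is an arbitrary example size; asserts with side effects are for illustration only):

#include <cassert>
#include <cstddef>
#include <cuda.h>

int main()
{
  assert(cuInit(0) == CUDA_SUCCESS);
  CUdevice dev;
  CUcontext ctx;
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev) == CUDA_SUCCESS);

  const size_t kSize = 1 << 20;

  /* alloc_device / copy_host_to_device / free_device. */
  CUdeviceptr dptr;
  char host[16] = "hello";
  assert(cuMemAlloc(&dptr, kSize) == CUDA_SUCCESS);
  assert(cuMemcpyHtoD(dptr, host, sizeof(host)) == CUDA_SUCCESS);
  assert(cuMemFree(dptr) == CUDA_SUCCESS);

  /* alloc_host / transform_host_pointer / free_host. */
  void *shared = nullptr;
  assert(cuMemHostAlloc(&shared, kSize,
                        CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED) ==
         CUDA_SUCCESS);
  CUdeviceptr mapped;
  assert(cuMemHostGetDevicePointer(&mapped, shared, 0) == CUDA_SUCCESS);
  assert(cuMemFreeHost(shared) == CUDA_SUCCESS);

  cuCtxDestroy(ctx);
  return 0;
}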
void CUDADevice::mem_alloc(device_memory &mem)
@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(cuda_mem_map_mutex);
if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const CUDAContextScope scope(this);
cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}
@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem)
return;
}
CUDAMem *cmem = NULL;
Mem *cmem = NULL;
CUarray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (CUarray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(cuda_mem_map_mutex);
cmem = &cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
CUDAContextScope scope(this);
thread_scoped_lock lock(cuda_mem_map_mutex);
DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
const CUDAMem &cmem = cuda_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
cuArrayDestroy(cmem.array);
cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
cuda_mem_map.erase(cuda_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();


@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class CUDADevice : public Device {
class CUDADevice : public GPUDevice {
friend class CUDAContextScope;
@ -29,36 +29,11 @@ class CUDADevice : public Device {
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int cuDevId;
int cuDevArchitecture;
bool first_error;
struct CUDAMem {
CUDAMem() : texobject(0), array(0), use_mapped_host(false)
{
}
CUtexObject texobject;
CUarray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, CUDAMem> CUDAMemMap;
CUDAMemMap cuda_mem_map;
thread_mutex cuda_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
CUDADeviceKernels kernels;
static bool have_precompiled_kernels();
@ -88,17 +63,13 @@ class CUDADevice : public Device {
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;


@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory()
return nullptr;
}
GPUDevice::~GPUDevice() noexcept(false)
{
}
bool GPUDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
return true;
}
else {
return false;
}
}
void GPUDevice::init_host_memory(size_t preferred_texture_headroom,
size_t preferred_working_headroom)
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory headroom lower than the texture one so that some space
* is left for working memory after all texture allocations. */
device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom :
32 * 1024 * 1024LL; // 32MB
device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom :
128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
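The limit rule is easier to see with numbers: with 32 GB of system RAM the mapped-host cap is 28 GB (leave 4 GB free), while with 6 GB of RAM it is 3 GB (leave half free). A standalone sketch of just that computation, assuming system_physical_ram() reports bytes:

#include <cstddef>
#include <cstdio>

/* Sketch of GPUDevice::init_host_memory()'s limit rule: leave at least
 * half of system RAM or 4 GB free, whichever is smaller. */
static size_t mapped_host_limit(size_t system_ram)
{
  const size_t default_limit = 4ull * 1024 * 1024 * 1024;
  if (system_ram == 0) {
    return 0; /* Unknown RAM size: disable mapped host memory. */
  }
  return (system_ram / 2 > default_limit) ? system_ram - default_limit : system_ram / 2;
}

int main()
{
  printf("%zu\n", mapped_host_limit(32ull << 30)); /* 28 GB in bytes. */
  printf("%zu\n", mapped_host_limit(6ull << 30));  /* 3 GB in bytes. */
}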
void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(device_mem_map_mutex);
foreach (MemMap::value_type &pair, device_mem_map) {
device_memory &mem = *pair.first;
Mem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple backend devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught by this mutex
* if it happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
void *device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
bool mem_alloc_result = false;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
get_device_memory_info(total, free);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
get_device_memory_info(total, free);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = alloc_device(device_pointer, size);
if (mem_alloc_result) {
device_mem_in_use += size;
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = true;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = alloc_host(shared_pointer, size);
assert((mem_alloc_result && shared_pointer != 0) ||
(!mem_alloc_result && shared_pointer == 0));
}
if (mem_alloc_result) {
assert(transform_host_pointer(device_pointer, shared_pointer));
map_host_used += size;
status = " in host memory";
}
}
if (!mem_alloc_result) {
if (mem.type == MEM_DEVICE_ONLY) {
status = " failed, out of device memory";
set_error("System is out of GPU memory");
}
else {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(device_mem_map_mutex);
Mem *cmem = &device_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void GPUDevice::generic_free(device_memory &mem)
{
if (mem.device_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
free_host(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
free_device((void *)mem.device_pointer);
device_mem_in_use -= mem.device_size;
}
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
device_mem_map.erase(device_mem_map.find(&mem));
}
}
void GPUDevice::generic_copy_to(device_memory &mem)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should
* copy data from mem.host_pointer. */
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size());
}
}
/* DeviceInfo */
CCL_NAMESPACE_END


@ -309,6 +309,93 @@ class Device {
static uint devices_initialized_mask;
};
/* GPU device with common functionality shared between GPU backends. */
class GPUDevice : public Device {
protected:
GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
: Device(info_, stats_, profiler_),
texture_info(this, "texture_info", MEM_GLOBAL),
need_texture_info(false),
can_map_host(false),
map_host_used(0),
map_host_limit(0),
device_texture_headroom(0),
device_working_headroom(0),
device_mem_map(),
device_mem_map_mutex(),
move_texture_to_host(false),
device_mem_in_use(0)
{
}
public:
virtual ~GPUDevice() noexcept(false);
/* For GPUs that can use bindless textures in some way or another. */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */
virtual bool load_texture_info();
protected:
/* Memory allocation, only accessed through device_memory. */
friend class device_memory;
bool can_map_host;
size_t map_host_used;
size_t map_host_limit;
size_t device_texture_headroom;
size_t device_working_headroom;
typedef unsigned long long texMemObject;
typedef unsigned long long arrayMemObject;
struct Mem {
Mem() : texobject(0), array(0), use_mapped_host(false)
{
}
texMemObject texobject;
arrayMemObject array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, Mem> MemMap;
MemMap device_mem_map;
thread_mutex device_mem_map_mutex;
bool move_texture_to_host;
/* Simple counter that tracks the amount of device memory currently in use. */
size_t device_mem_in_use;
virtual void init_host_memory(size_t preferred_texture_headroom = 0,
size_t preferred_working_headroom = 0);
virtual void move_textures_to_host(size_t size, bool for_texture);
/* Allocation, deallocation and copy functions, with corresponding
* support of device/host allocations. */
virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
virtual void generic_free(device_memory &mem);
virtual void generic_copy_to(device_memory &mem);
/* total - amount of device memory, free - amount of available device memory */
virtual void get_device_memory_info(size_t &total, size_t &free) = 0;
virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
virtual void free_device(void *device_pointer) = 0;
virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
virtual void free_host(void *shared_pointer) = 0;
/* This function should return the device pointer corresponding to the shared pointer,
* which is a host buffer allocated in `alloc_host`. The function should return `true`
* if such an address transformation is possible and `false` otherwise. */
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_H__ */
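To make the new contract concrete, here is a hedged sketch of a hypothetical backend wiring up the pure-virtual hooks. FakeDevice and its malloc-backed "device" are illustrative stand-ins, not part of the patch (CUDA and HIP supply the real implementations elsewhere in this diff), and the remaining Device virtuals such as mem_alloc are omitted:

#include <cstdlib>
#include <cstring>

/* Hypothetical backend for illustration only: the "device" is the host
 * heap, so a shared host pointer already is the device pointer. */
class FakeDevice : public GPUDevice {
 public:
  FakeDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
      : GPUDevice(info, stats, profiler)
  {
    can_map_host = true;
    init_host_memory(); /* Sets map_host_limit and both headrooms. */
  }

  void get_device_memory_info(size_t &total, size_t &free) override
  {
    total = free = size_t(1) << 33; /* Pretend 8 GB is always available. */
  }
  bool alloc_device(void *&device_pointer, size_t size) override
  {
    return (device_pointer = malloc(size)) != nullptr;
  }
  void free_device(void *device_pointer) override
  {
    free(device_pointer);
  }
  bool alloc_host(void *&shared_pointer, size_t size) override
  {
    return (shared_pointer = malloc(size)) != nullptr;
  }
  void free_host(void *shared_pointer) override
  {
    free(shared_pointer);
  }
  bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    device_pointer = shared_pointer; /* Unified memory: same address on both sides. */
    return true;
  }
  void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    memcpy(device_pointer, host_pointer, size);
  }
};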


@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error)
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
: GPUDevice(info, stats, profiler)
{
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t));
static_assert(sizeof(arrayMemObject) == sizeof(hArray));
first_error = true;
hipDevId = info.num;
@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
need_texture_info = false;
device_texture_headroom = 0;
device_working_headroom = 0;
move_texture_to_host = false;
map_host_limit = 0;
map_host_used = 0;
can_map_host = 0;
pitch_alignment = 0;
/* Initialize HIP. */
@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
/* hipDeviceMapHost for mapping host memory when out of device memory.
* hipDeviceLmemResizeToMax for reserving local memory ahead of render,
* so we can predict which memory to map to host. */
hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
int value;
hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice));
can_map_host = value != 0;
hip_assert(
hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));
@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features)
# endif
}
void HIPDevice::init_host_memory()
{
/* Limit amount of host mapped memory, because allocating too much can
* cause system instability. Leave at least half or 4 GB of system
* memory free, whichever is smaller. */
size_t default_limit = 4 * 1024 * 1024 * 1024LL;
size_t system_ram = system_physical_ram();
if (system_ram > 0) {
if (system_ram / 2 > default_limit) {
map_host_limit = system_ram - default_limit;
}
else {
map_host_limit = system_ram / 2;
}
}
else {
VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
map_host_limit = 0;
}
/* Amount of device memory to keep free after texture memory
* and working memory allocations respectively. We set the working
* memory limit headroom lower so that some space is left after all
* texture memory allocations. */
device_working_headroom = 32 * 1024 * 1024LL; // 32MB
device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
<< " bytes. (" << string_human_readable_size(map_host_limit) << ")";
}
void HIPDevice::load_texture_info()
{
if (need_texture_info) {
/* Unset flag before copying, so this does not loop indefinitely if the copy below calls
* into 'move_textures_to_host' (which calls 'load_texture_info' again). */
need_texture_info = false;
texture_info.copy_to_device();
}
}
void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
/* Break out of recursive call, which can happen when moving memory on a multi device. */
static bool any_device_moving_textures_to_host = false;
if (any_device_moving_textures_to_host) {
return;
}
/* Signal to reallocate textures in host memory only. */
move_texture_to_host = true;
while (size > 0) {
/* Find suitable memory allocation to move. */
device_memory *max_mem = NULL;
size_t max_size = 0;
bool max_is_image = false;
thread_scoped_lock lock(hip_mem_map_mutex);
foreach (HIPMemMap::value_type &pair, hip_mem_map) {
device_memory &mem = *pair.first;
HIPMem *cmem = &pair.second;
/* Can only move textures allocated on this device (and not those from peer devices).
* And need to ignore memory that is already on the host. */
if (!mem.is_resident(this) || cmem->use_mapped_host) {
continue;
}
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
(&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
/* Can't move this type of memory. */
if (!is_texture || cmem->array) {
continue;
}
/* For other textures, only move image textures. */
if (for_texture && !is_image) {
continue;
}
/* Try to move largest allocation, prefer moving images. */
if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
max_is_image = is_image;
max_size = mem.device_size;
max_mem = &mem;
}
}
lock.unlock();
/* Move to host memory. This part is mutex protected since
* multiple HIP devices could be moving the memory. The
* first one will do it, and the rest will adopt the pointer. */
if (max_mem) {
VLOG_WORK << "Move memory from device to host: " << max_mem->name;
static thread_mutex move_mutex;
thread_scoped_lock lock(move_mutex);
any_device_moving_textures_to_host = true;
/* Potentially need to call back into multi device, so pointer mapping
* and peer devices are updated. This is also necessary since the device
* pointer may just be a key here, so cannot be accessed and freed directly.
* Unfortunately it does mean that memory is reallocated on all other
* devices as well, which is potentially dangerous when still in use (since
* a thread rendering on another device would only be caught by this mutex
* if it happens to do an allocation at the same time as well). */
max_mem->device_copy_to();
size = (max_size >= size) ? 0 : size - max_size;
any_device_moving_textures_to_host = false;
}
else {
break;
}
}
/* Unset flag before texture info is reloaded, since it should stay in device memory. */
move_texture_to_host = false;
/* Update texture info array with new pointers. */
load_texture_info();
}
HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
void HIPDevice::get_device_memory_info(size_t &total, size_t &free)
{
HIPContextScope scope(this);
hipDeviceptr_t device_pointer = 0;
size_t size = mem.memory_size() + pitch_padding;
hipError_t mem_alloc_result = hipErrorOutOfMemory;
const char *status = "";
/* First try allocating in device memory, respecting headroom. We make
* an exception for texture info. It is small and frequently accessed,
* so treat it as working memory.
*
* If there is not enough room for working memory, we will try to move
* textures to host memory, assuming the performance impact would have
* been worse for working memory. */
bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
bool is_image = is_texture && (mem.data_height > 1);
size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
size_t total = 0, free = 0;
hipMemGetInfo(&free, &total);
/* Move textures to host memory if needed. */
if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
move_textures_to_host(size + headroom - free, is_texture);
hipMemGetInfo(&free, &total);
}
/* Allocate in device memory. */
if (!move_texture_to_host && (size + headroom) < free) {
mem_alloc_result = hipMalloc(&device_pointer, size);
if (mem_alloc_result == hipSuccess) {
status = " in device memory";
}
}
/* Fall back to mapped host memory if needed and possible. */
void *shared_pointer = 0;
if (mem_alloc_result != hipSuccess && can_map_host) {
if (mem.shared_pointer) {
/* Another device already allocated host memory. */
mem_alloc_result = hipSuccess;
shared_pointer = mem.shared_pointer;
}
else if (map_host_used + size < map_host_limit) {
/* Allocate host memory ourselves. */
mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
(mem_alloc_result != hipSuccess && shared_pointer == 0));
}
if (mem_alloc_result == hipSuccess) {
hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
map_host_used += size;
status = " in host memory";
}
}
if (mem_alloc_result != hipSuccess) {
status = " failed, out of device and host memory";
set_error("System is out of GPU and shared host memory");
}
if (mem.name) {
VLOG_WORK << "Buffer allocate: " << mem.name << ", "
<< string_human_readable_number(mem.memory_size()) << " bytes. ("
<< string_human_readable_size(mem.memory_size()) << ")" << status;
}
mem.device_pointer = (device_ptr)device_pointer;
mem.device_size = size;
stats.mem_alloc(size);
if (!mem.device_pointer) {
return NULL;
}
/* Insert into map of allocations. */
thread_scoped_lock lock(hip_mem_map_mutex);
HIPMem *cmem = &hip_mem_map[&mem];
if (shared_pointer != 0) {
/* Replace host pointer with our host allocation. Only works if
* HIP memory layout is the same and has no pitch padding. Also
* does not work if we move textures to host during a render,
* since other devices might be using the memory. */
if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
mem.host_pointer != shared_pointer) {
memcpy(shared_pointer, mem.host_pointer, size);
/* A call to device_memory::host_free() should be preceded by
* a call to device_memory::device_free() for host memory
* allocated by a device to be handled properly. Two exceptions
* are here and a call in OptiXDevice::generic_alloc(), where
* the current host memory can be assumed to be allocated by
* device_memory::host_alloc(), not by a device */
mem.host_free();
mem.host_pointer = shared_pointer;
}
mem.shared_pointer = shared_pointer;
mem.shared_counter++;
cmem->use_mapped_host = true;
}
else {
cmem->use_mapped_host = false;
}
return cmem;
}
void HIPDevice::generic_copy_to(device_memory &mem)
bool HIPDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.host_pointer || !mem.device_pointer) {
return;
}
HIPContextScope scope(this);
/* If use_mapped_host of mem is false, the current device only uses device memory allocated by
* hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
* mem.host_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(
hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}
hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size);
return mem_alloc_result == hipSuccess;
}
void HIPDevice::generic_free(device_memory &mem)
void HIPDevice::free_device(void *device_pointer)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
HIPContextScope scope(this);
/* If cmem.use_mapped_host is true, reference counting is used
* to safely free a mapped host memory. */
hip_assert(hipFree((hipDeviceptr_t)device_pointer));
}
if (cmem.use_mapped_host) {
assert(mem.shared_pointer);
if (mem.shared_pointer) {
assert(mem.shared_counter > 0);
if (--mem.shared_counter == 0) {
if (mem.host_pointer == mem.shared_pointer) {
mem.host_pointer = 0;
}
hipHostFree(mem.shared_pointer);
mem.shared_pointer = 0;
}
}
map_host_used -= mem.device_size;
}
else {
/* Free device memory. */
hip_assert(hipFree(mem.device_pointer));
}
bool HIPDevice::alloc_host(void *&shared_pointer, size_t size)
{
HIPContextScope scope(this);
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hipError_t mem_alloc_result = hipHostMalloc(
&shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);
hip_mem_map.erase(hip_mem_map.find(&mem));
}
return mem_alloc_result == hipSuccess;
}
void HIPDevice::free_host(void *shared_pointer)
{
HIPContextScope scope(this);
hipHostFree(shared_pointer);
}
bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
HIPContextScope scope(this);
hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0));
return true;
}
void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
const HIPContextScope scope(this);
hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size));
}
void HIPDevice::mem_alloc(device_memory &mem)
@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem)
/* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
* regardless of mem.host_pointer and mem.shared_pointer. */
thread_scoped_lock lock(hip_mem_map_mutex);
if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
thread_scoped_lock lock(device_mem_map_mutex);
if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
const HIPContextScope scope(this);
hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size()));
}
@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem)
return;
}
HIPMem *cmem = NULL;
Mem *cmem = NULL;
hArray array_3d = NULL;
size_t src_pitch = mem.data_width * dsize * mem.data_elements;
size_t dst_pitch = src_pitch;
if (!mem.is_resident(this)) {
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
if (mem.data_depth > 1) {
array_3d = (hArray)mem.device_pointer;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
dst_pitch = align_up(src_pitch, pitch_alignment);
@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem)
mem.device_size = size;
stats.mem_alloc(size);
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
cmem->texobject = 0;
cmem->array = array_3d;
cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
}
else if (mem.data_height > 0) {
/* 2D texture, using pitch aligned linear memory. */
@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem)
texDesc.filterMode = filter_mode;
texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;
thread_scoped_lock lock(hip_mem_map_mutex);
cmem = &hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
cmem = &device_mem_map[&mem];
hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem)
{
if (mem.device_pointer) {
HIPContextScope scope(this);
thread_scoped_lock lock(hip_mem_map_mutex);
DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
const HIPMem &cmem = hip_mem_map[&mem];
thread_scoped_lock lock(device_mem_map_mutex);
DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
const Mem &cmem = device_mem_map[&mem];
if (cmem.texobject) {
/* Free bindless texture. */
@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem)
if (!mem.is_resident(this)) {
/* Do not free memory here, since it was allocated on a different device. */
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else if (cmem.array) {
/* Free array. */
hipArrayDestroy(cmem.array);
hipArrayDestroy(reinterpret_cast<hArray>(cmem.array));
stats.mem_free(mem.device_size);
mem.device_pointer = 0;
mem.device_size = 0;
hip_mem_map.erase(hip_mem_map.find(&mem));
device_mem_map.erase(device_mem_map.find(&mem));
}
else {
lock.unlock();


@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
class HIPDevice : public Device {
class HIPDevice : public GPUDevice {
friend class HIPContextScope;
@ -26,36 +26,11 @@ class HIPDevice : public Device {
hipDevice_t hipDevice;
hipCtx_t hipContext;
hipModule_t hipModule;
size_t device_texture_headroom;
size_t device_working_headroom;
bool move_texture_to_host;
size_t map_host_used;
size_t map_host_limit;
int can_map_host;
int pitch_alignment;
int hipDevId;
int hipDevArchitecture;
bool first_error;
struct HIPMem {
HIPMem() : texobject(0), array(0), use_mapped_host(false)
{
}
hipTextureObject_t texobject;
hArray array;
/* If true, a mapped host memory in shared_pointer is being used. */
bool use_mapped_host;
};
typedef map<device_memory *, HIPMem> HIPMemMap;
HIPMemMap hip_mem_map;
thread_mutex hip_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
HIPDeviceKernels kernels;
static bool have_precompiled_kernels();
@ -81,17 +56,13 @@ class HIPDevice : public Device {
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
void move_textures_to_host(size_t size, bool for_texture);
HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
void mem_alloc(device_memory &mem) override;


@ -73,6 +73,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
return "integrator_terminated_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
return "integrator_sorted_paths_array";
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
return "integrator_sort_bucket_pass";
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS:
return "integrator_sort_write_pass";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
return "integrator_compact_paths_array";
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:


@ -247,6 +247,8 @@ class device_memory {
bool is_resident(Device *sub_device) const;
protected:
friend class Device;
friend class GPUDevice;
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;


@ -21,6 +21,7 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
vector<uint32_t> blas_lookup;
bool motion_blur = false;


@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
// Allocate the look-up buffer for the worst-case scenario
uint64_t count = objects.size();
blas_lookup.resize(count);
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
int currIndex = instance_index++;
assert(user_id < blas_lookup.size());
blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
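
Both descriptor variants now record the same mapping: for every instance, blas_lookup[user_id] stores the index of that object's BLAS, so the local-intersection path can later jump straight to the primitive acceleration structure. A reduced sketch of the bookkeeping (simplified types; non-traceable objects omitted; blas_index_for is a hypothetical stand-in for the index assigned during BLAS building):

vector<uint32_t> blas_lookup(objects.size());
for (Object *ob : objects) {
  const uint32_t user_id = uint32_t(ob->get_device_index());
  const uint32_t accel_struct_index = blas_index_for(ob); /* hypothetical helper */
  assert(user_id < blas_lookup.size());
  blas_lookup[user_id] = accel_struct_index; /* Same table for motion and non-motion descriptors. */
}
/* GPU side (see scene_intersect_local further below):
 *   int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
 *   metalrt_intersect.intersect(r, metal_ancillaries->blas_accel_structs[blas_index].blas, ...); */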

View File

@ -55,6 +55,10 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
devices.push_back(info);
device_index++;
}

View File

@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
/* BLAS encoding & lookup */
id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
id<MTLBuffer> blas_buffer = nil;
id<MTLBuffer> blas_lookup_buffer = nil;
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
@ -105,6 +110,8 @@ class MetalDevice : public Device {
bool use_adaptive_compilation();
bool use_local_atomic_sort() const;
bool make_source_and_check_if_compile_needed(MetalPipelineType pso_type);
void make_source(MetalPipelineType pso_type, const uint kernel_features);

View File

@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
arg_desc_ptrs.dataType = MTLDataTypePointer;
arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
arg_desc_ptrs.index = index++;
[ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
[arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
// Prepare the BLAS argument encoder
MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_blas.access = MTLArgumentAccessReadOnly;
mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
[arg_desc_blas release];
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@ -271,6 +289,11 @@ bool MetalDevice::use_adaptive_compilation()
return DebugFlags().metal.adaptive_compile;
}
bool MetalDevice::use_local_atomic_sort() const
{
return DebugFlags().metal.use_local_atomic_sort;
}
void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
{
string global_defines;
@ -278,6 +301,10 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
}
if (use_local_atomic_sort()) {
global_defines += "#define __KERNEL_LOCAL_ATOMIC_SORT__\n";
}
if (use_metalrt) {
global_defines += "#define __METALRT__\n";
if (motion_blur) {
@ -1231,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
// Allocate the required buffers for the BLAS array
uint64_t count = bvhMetalRT->blas_array.size();
uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
stats.mem_alloc(blas_buffer.allocatedSize);
for (uint64_t i = 0; i < count; ++i) {
[mtlBlasArgEncoder setArgumentBuffer:blas_buffer
offset:i * mtlBlasArgEncoder.encodedLength];
[mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
}
count = bvhMetalRT->blas_lookup.size();
bufferSize = sizeof(uint32_t) * count;
blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
options:default_storage_mode];
stats.mem_alloc(blas_lookup_buffer.allocatedSize);
memcpy([blas_lookup_buffer contents],
bvhMetalRT->blas_lookup.data(),
bufferSize);
if (default_storage_mode == MTLResourceStorageModeManaged) {
[blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
[blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
}
}
}
}

View File

@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_LOCAL_TRI_PRIM,
METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
enum {
METALRT_TABLE_DEFAULT,
METALRT_TABLE_SHADOW,
METALRT_TABLE_LOCAL,
METALRT_TABLE_LOCAL_PRIM,
METALRT_TABLE_NUM
};
/* Pipeline State Object types */
enum MetalPipelineType {

View File

@ -87,6 +87,9 @@ struct ShaderCache {
break;
}
}
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS] = {1024, 1024};
occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS] = {1024, 1024};
}
~ShaderCache();
@ -521,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri_prim",
"__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@ -611,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]

View File

@ -25,6 +25,7 @@ class MetalDeviceQueue : public DeviceQueue {
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states(const size_t) const override;
virtual int num_sort_partition_elements() const override;
virtual bool supports_local_atomic_sort() const override;
virtual void init_execution() override;

View File

@ -315,6 +315,11 @@ int MetalDeviceQueue::num_sort_partition_elements() const
return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
}
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
return metal_device_->use_local_atomic_sort();
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
@ -477,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@ -527,6 +538,10 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (bvhMetalRT) {
/* Mark all acceleration structure resources as used */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResource:metal_device_->blas_lookup_buffer
usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
@ -553,13 +568,24 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
int key_count = metal_device_->launch_params.data.max_shaders;
shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
break;
}
default:
break;
}
if (shared_mem_bytes) {
assert(shared_mem_bytes <= 32 * 1024);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
}
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
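
For the two sort passes, the threadgroup memory holds one int per sort key, so the requirement scales with max_shaders rather than with the threadgroup size. A quick worked example, assuming a hypothetical scene with 100 shaders:

/* Hypothetical: max_shaders = 100. */
const int key_count = 100;
const int shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16); /* 400 bytes */
/* The assert above caps this at 32 KB of threadgroup memory, i.e. at most
 * 8192 sort keys with 4-byte ints. */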

View File

@ -64,6 +64,12 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
return METAL_GPU_INTEL;
}
else if (strstr(device_name, "AMD")) {
/* Setting this env var hides AMD devices, thus exposing any integrated Intel devices. */
if (auto str = getenv("CYCLES_METAL_FORCE_INTEL")) {
if (atoi(str)) {
return METAL_GPU_UNKNOWN;
}
}
return METAL_GPU_AMD;
}
else if (strstr(device_name, "Apple")) {
@ -96,6 +102,15 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
return usable_devices;
}
/* If the system has both an AMD GPU (discrete) and an Intel one (integrated), prefer the AMD
* one. This can be overridden with CYCLES_METAL_FORCE_INTEL. */
bool has_usable_amd_gpu = false;
if (@available(macos 12.3, *)) {
for (id<MTLDevice> device in MTLCopyAllDevices()) {
has_usable_amd_gpu |= (get_device_vendor(device) == METAL_GPU_AMD);
}
}
metal_printf("Usable Metal devices:\n");
for (id<MTLDevice> device in MTLCopyAllDevices()) {
string device_name = get_device_name(device);
@ -111,8 +126,10 @@ vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
}
# if defined(MAC_OS_VERSION_13_0)
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
if (!has_usable_amd_gpu) {
if (@available(macos 13.0, *)) {
usable |= (vendor == METAL_GPU_INTEL);
}
}
# endif

View File

@ -377,7 +377,7 @@ void OneapiDevice::tex_alloc(device_texture &mem)
generic_alloc(mem);
generic_copy_to(mem);
/* Resize if needed. Also, in case of resize - allocate in advance for future allocs. */
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
const uint slot = mem.slot;
if (slot >= texture_info_.size()) {
texture_info_.resize(slot + 128);

View File

@ -854,12 +854,14 @@ bool OptiXDevice::load_osl_kernels()
context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
}
OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
vector<OptixStackSizes> osl_stack_size(osl_groups.size());
/* Update SBT with new entries. */
sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
}
for (size_t i = 0; i < osl_groups.size(); ++i) {
if (osl_groups[i] != NULL) {
@ -907,13 +909,15 @@ bool OptiXDevice::load_osl_kernels()
0,
&pipelines[PIP_SHADE]));
const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
unsigned int dss = 0;
for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
dss = std::max(dss, osl_stack_size[i].dssDC);
}
optix_assert(optixPipelineSetStackSize(
pipelines[PIP_SHADE], 0, dss, 0, pipeline_options.usesMotionBlur ? 3 : 2));
pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
}
return !have_error();
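
The one-line change at the end is a bug fix: per the OptiX 7 API, the fourth argument of optixPipelineSetStackSize is the continuation stack size, which was previously hard-coded to 0; it now receives css, the largest continuation stack required by the shade raygen groups, alongside dss, the maximum direct-callable stack size over the OSL groups. Annotated for reference:

/* optixPipelineSetStackSize(pipeline,
 *                           directCallableStackSizeFromTraversal,  -> 0
 *                           directCallableStackSizeFromState,      -> dss (max over OSL groups)
 *                           continuationStackSize,                 -> css (was 0 before this fix)
 *                           maxTraversableGraphDepth);             -> 3 with motion blur, else 2 */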

View File

@ -112,6 +112,13 @@ class DeviceQueue {
return 65536;
}
/* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
* INTEGRATOR_SORT_WRITE_PASS)? */
virtual bool supports_local_atomic_sort() const
{
return false;
}
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.

View File

@ -71,6 +71,8 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
integrator_shader_sort_prefix_sum_(
device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
integrator_shader_sort_partition_key_offsets_(
device, "integrator_shader_sort_partition_key_offsets", MEM_READ_WRITE),
integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
integrator_next_shadow_path_index_(
device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
@ -207,33 +209,45 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
num_sort_partitions_);
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
/* Allocate array for partitioned shader sorting using local atomics. */
const int num_offsets = (device_scene_->data.max_shaders + 1) * num_sort_partitions_;
if (integrator_shader_sort_partition_key_offsets_.size() < num_offsets) {
integrator_shader_sort_partition_key_offsets_.alloc(num_offsets);
integrator_shader_sort_partition_key_offsets_.zero_to_device();
}
integrator_state_gpu_.sort_partition_key_offsets =
(int *)integrator_shader_sort_partition_key_offsets_.device_pointer;
}
else {
/* Allocate arrays for shader sorting. */
const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
if (integrator_shader_sort_counter_.size() < sort_buckets) {
integrator_shader_sort_counter_.alloc(sort_buckets);
integrator_shader_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
(int *)integrator_shader_sort_counter_.device_pointer;
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
integrator_shader_sort_prefix_sum_.zero_to_device();
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_raytrace_sort_counter_.size() < sort_buckets) {
integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
integrator_shader_raytrace_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
(int *)integrator_shader_raytrace_sort_counter_.device_pointer;
}
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (integrator_shader_mnee_sort_counter_.size() < sort_buckets) {
integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
integrator_shader_mnee_sort_counter_.zero_to_device();
integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
(int *)integrator_shader_mnee_sort_counter_.device_pointer;
}
}
}
}
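
A worked example of the two allocation paths, for a hypothetical scene with max_shaders = 64 and num_sort_partitions_ = 4: the local-atomic path needs (64 + 1) * 4 = 260 key offsets, because each partition stores one offset per shader plus a final slot holding its active-state count, while the fallback path needs 64 * 4 = 256 counter buckets plus an equally sized prefix-sum array.

/* Hypothetical sizes. */
const int max_shaders = 64;
const int num_sort_partitions = 4;

/* Local-atomic sort: per-partition offsets plus one count slot each. */
const int num_offsets = (max_shaders + 1) * num_sort_partitions; /* 260 ints */

/* Fallback sort: one atomic counter per (shader, partition) bucket,
 * mirrored by the prefix-sum array of the same size. */
const int sort_buckets = max_shaders * num_sort_partitions; /* 256 ints */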
@ -451,8 +465,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
work_size = num_queued;
d_path_index = queued_paths_.device_pointer;
compute_sorted_queued_paths(
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, kernel, num_paths_limit);
compute_sorted_queued_paths(kernel, num_paths_limit);
}
else if (num_queued < work_size) {
work_size = num_queued;
@ -511,11 +524,26 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel, const int num
}
}
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel queued_kernel,
const int num_paths_limit)
{
int d_queued_kernel = queued_kernel;
/* Launch kernel to fill the active paths arrays. */
if (num_sort_partitions_ > 1 && queue_->supports_local_atomic_sort()) {
const int work_size = kernel_max_active_main_path_index(queued_kernel);
device_ptr d_queued_paths = queued_paths_.device_pointer;
int partition_size = (int)integrator_state_gpu_.sort_partition_divisor;
DeviceKernelArguments args(
&work_size, &partition_size, &num_paths_limit, &d_queued_paths, &d_queued_kernel);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS, 1024 * num_sort_partitions_, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS, 1024 * num_sort_partitions_, args);
return;
}
device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
assert(d_counter != 0 && d_prefix_sum != 0);
@ -552,7 +580,7 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
&d_prefix_sum,
&d_queued_kernel);
queue_->enqueue(kernel, work_size, args);
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY, work_size, args);
}
}
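
The dispatch geometry matches GPU_PARALLEL_SORT_BLOCK_SIZE: enqueueing 1024 * num_sort_partitions_ threads yields exactly one 1024-thread threadgroup per partition, so inside the kernels the threadgroup index doubles as the partition index. A sketch of the mapping, with hypothetical values:

const int num_sort_partitions = 4;                /* hypothetical */
const int work_size = 1024 * num_sort_partitions; /* 4 threadgroups of 1024 threads */
/* In the kernel, threadgroup g covers states
 *   [partition_size * g, min(num_states, partition_size * (g + 1))),
 * where partition_size is the sort_partition_divisor set up above. */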

View File

@ -70,9 +70,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_path_iteration(DeviceKernel kernel, const int num_paths_limit = INT_MAX);
void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
void compute_sorted_queued_paths(DeviceKernel kernel,
DeviceKernel queued_kernel,
const int num_paths_limit);
void compute_sorted_queued_paths(DeviceKernel queued_kernel, const int num_paths_limit);
void compact_main_paths(const int num_active_paths);
void compact_shadow_paths();
@ -135,6 +133,7 @@ class PathTraceWorkGPU : public PathTraceWork {
device_vector<int> integrator_shader_raytrace_sort_counter_;
device_vector<int> integrator_shader_mnee_sort_counter_;
device_vector<int> integrator_shader_sort_prefix_sum_;
device_vector<int> integrator_shader_sort_partition_key_offsets_;
/* Path split. */
device_vector<int> integrator_next_main_path_index_;
device_vector<int> integrator_next_shadow_path_index_;

View File

@ -661,4 +661,38 @@ ccl_device void bsdf_blur(KernelGlobals kg, ccl_private ShaderClosure *sc, float
#endif
}
ccl_device_inline Spectrum bsdf_albedo(ccl_private const ShaderData *sd,
ccl_private const ShaderClosure *sc)
{
Spectrum albedo = sc->weight;
/* Some closures include additional components such as Fresnel terms that cause their albedo to
* be below 1. The point of this function is to return a best-effort estimation of their albedo,
* meaning the amount of reflected/refracted light that would be expected when illuminated by a
* uniform white background.
* This is used for the denoising albedo pass and diffuse/glossy/transmission color passes.
* NOTE: This should always match the sample_weight of the closure - as in, if there's an albedo
* adjustment in here, the sample_weight should also be reduced accordingly.
* TODO(lukas): Consider calling this function to determine the sample_weight? Would be a bit of
* extra overhead though. */
#if defined(__SVM__) || defined(__OSL__)
switch (sc->type) {
case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
albedo *= microfacet_fresnel((ccl_private const MicrofacetBsdf *)sc, sd->wi, sc->N);
break;
case CLOSURE_BSDF_PRINCIPLED_SHEEN_ID:
albedo *= ((ccl_private const PrincipledSheenBsdf *)sc)->avg_value;
break;
case CLOSURE_BSDF_HAIR_PRINCIPLED_ID:
albedo *= bsdf_principled_hair_albedo(sc);
break;
default:
break;
}
#endif
return albedo;
}
CCL_NAMESPACE_END

View File

@ -23,8 +23,6 @@ enum MicrofacetType {
typedef struct MicrofacetExtra {
Spectrum color, cspec0;
Spectrum fresnel_color;
float clearcoat;
} MicrofacetExtra;
typedef struct MicrofacetBsdf {
@ -184,26 +182,25 @@ ccl_device_forceinline float3 microfacet_ggx_sample_vndf(const float3 wi,
*
* Else it is simply white
*/
ccl_device_forceinline Spectrum reflection_color(ccl_private const MicrofacetBsdf *bsdf,
float3 L,
float3 H)
ccl_device_forceinline Spectrum microfacet_fresnel(ccl_private const MicrofacetBsdf *bsdf,
float3 wi,
float3 H)
{
Spectrum F = one_spectrum();
bool use_clearcoat = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || use_clearcoat);
if (use_fresnel) {
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
F = interpolate_fresnel_color(L, H, bsdf->ior, F0, bsdf->extra->cspec0);
if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type)) {
return interpolate_fresnel_color(wi, H, bsdf->ior, bsdf->extra->cspec0);
}
if (use_clearcoat) {
F *= 0.25f * bsdf->extra->clearcoat;
else if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
return make_spectrum(fresnel_dielectric_cos(dot(wi, H), bsdf->ior));
}
else {
return one_spectrum();
}
}
return F;
ccl_device_forceinline void bsdf_microfacet_adjust_weight(ccl_private const ShaderData *sd,
ccl_private MicrofacetBsdf *bsdf)
{
bsdf->sample_weight *= average(microfacet_fresnel(bsdf, sd->wi, bsdf->N));
}
/* Generalized Trowbridge-Reitz for clearcoat. */
@ -292,22 +289,6 @@ ccl_device_inline float bsdf_aniso_D(float alpha_x, float alpha_y, float3 H)
}
}
ccl_device_forceinline void bsdf_microfacet_fresnel_color(ccl_private const ShaderData *sd,
ccl_private MicrofacetBsdf *bsdf)
{
kernel_assert(CLOSURE_IS_BSDF_MICROFACET_FRESNEL(bsdf->type));
float F0 = fresnel_dielectric_cos(1.0f, bsdf->ior);
bsdf->extra->fresnel_color = interpolate_fresnel_color(
sd->wi, bsdf->N, bsdf->ior, F0, bsdf->extra->cspec0);
if (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID) {
bsdf->extra->fresnel_color *= 0.25f * bsdf->extra->clearcoat;
}
bsdf->sample_weight *= average(bsdf->extra->fresnel_color);
}
template<MicrofacetType m_type>
ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
const float3 Ng,
@ -380,8 +361,7 @@ ccl_device Spectrum bsdf_microfacet_eval(ccl_private const ShaderClosure *sc,
*pdf = common / (1.0f + lambdaI);
const Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, wo, H);
const Spectrum F = microfacet_fresnel(bsdf, wo, H);
return F * common / (1.0f + lambdaO + lambdaI);
}
@ -463,14 +443,7 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
label |= LABEL_SINGULAR;
/* Some high number for MIS. */
*pdf = 1e6f;
*eval = make_spectrum(1e6f);
bool use_fresnel = (bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID ||
bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID);
if (use_fresnel && !m_refractive) {
*eval *= reflection_color(bsdf, *wo, H);
}
*eval = make_spectrum(1e6f) * microfacet_fresnel(bsdf, *wo, H);
}
else {
label |= LABEL_GLOSSY;
@ -511,8 +484,7 @@ ccl_device int bsdf_microfacet_sample(ccl_private const ShaderClosure *sc,
*pdf = common / (1.0f + lambdaI);
Spectrum F = m_refractive ? one_spectrum() : reflection_color(bsdf, *wo, H);
Spectrum F = microfacet_fresnel(bsdf, *wo, H);
*eval = F * common / (1.0f + lambdaI + lambdaO);
}
@ -547,14 +519,6 @@ ccl_device int bsdf_microfacet_ggx_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_ggx_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
@ -565,7 +529,7 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@ -573,14 +537,12 @@ ccl_device int bsdf_microfacet_ggx_fresnel_setup(ccl_private MicrofacetBsdf *bsd
ccl_device int bsdf_microfacet_ggx_clearcoat_setup(ccl_private MicrofacetBsdf *bsdf,
ccl_private const ShaderData *sd)
{
bsdf->extra->cspec0 = saturate(bsdf->extra->cspec0);
bsdf->alpha_x = saturatef(bsdf->alpha_x);
bsdf->alpha_y = bsdf->alpha_x;
bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
@ -643,14 +605,6 @@ ccl_device int bsdf_microfacet_beckmann_setup(ccl_private MicrofacetBsdf *bsdf)
return SD_BSDF | SD_BSDF_HAS_EVAL;
}
/* Required to maintain OSL interface. */
ccl_device int bsdf_microfacet_beckmann_isotropic_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_y = bsdf->alpha_x;
return bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetBsdf *bsdf)
{
bsdf->alpha_x = saturatef(bsdf->alpha_x);

View File

@ -401,7 +401,7 @@ ccl_device int bsdf_microfacet_multi_ggx_fresnel_setup(ccl_private MicrofacetBsd
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return bsdf_microfacet_multi_ggx_common_setup(bsdf);
}
@ -575,7 +575,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private Microfa
bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID;
bsdf_microfacet_fresnel_color(sd, bsdf);
bsdf_microfacet_adjust_weight(sd, bsdf);
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
}

View File

@ -73,9 +73,8 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
eval = make_spectrum(val);
#endif
float F0 = fresnel_dielectric_cos(1.0f, eta);
if (use_fresnel) {
throughput = interpolate_fresnel_color(wi, wh, eta, F0, cspec0);
throughput = interpolate_fresnel_color(wi, wh, eta, cspec0);
eval *= throughput;
}
@ -144,11 +143,11 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi,
throughput *= color;
}
else if (use_fresnel && order > 0) {
throughput *= interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
}
#else /* MF_MULTI_GLOSSY */
if (use_fresnel && order > 0) {
throughput *= interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
}
wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif
@ -192,8 +191,6 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
float G1_r = 0.0f;
bool outside = true;
float F0 = fresnel_dielectric_cos(1.0f, eta);
int order;
for (order = 0; order < 10; order++) {
/* Sample microfacet height. */
@ -229,22 +226,12 @@ ccl_device_forceinline Spectrum MF_FUNCTION_FULL_NAME(mf_sample)(float3 wi,
throughput *= color;
}
else {
Spectrum t_color = interpolate_fresnel_color(wi_prev, wm, eta, F0, cspec0);
if (order == 0)
throughput = t_color;
else
throughput *= t_color;
throughput *= interpolate_fresnel_color(wi_prev, wm, eta, cspec0);
}
}
#else /* MF_MULTI_GLOSSY */
if (use_fresnel) {
Spectrum t_color = interpolate_fresnel_color(-wr, wm, eta, F0, cspec0);
if (order == 0)
throughput = t_color;
else
throughput *= t_color;
throughput *= interpolate_fresnel_color(-wr, wm, eta, cspec0);
}
wr = mf_sample_phase_glossy(-wr, &throughput, wm);
#endif

View File

@ -89,19 +89,21 @@ ccl_device float schlick_fresnel(float u)
return m2 * m2 * m; // pow(m, 5)
}
/* Calculate the fresnel color which is a blend between white and the F0 color (cspec0) */
ccl_device_forceinline Spectrum
interpolate_fresnel_color(float3 L, float3 H, float ior, float F0, Spectrum cspec0)
/* Calculate the fresnel color, which is a blend between white and the F0 color */
ccl_device_forceinline Spectrum interpolate_fresnel_color(float3 L,
float3 H,
float ior,
Spectrum F0)
{
/* Calculate the fresnel interpolation factor
* The value from fresnel_dielectric_cos(...) has to be normalized because
* the cspec0 keeps the F0 color
*/
float F0_norm = 1.0f / (1.0f - F0);
float FH = (fresnel_dielectric_cos(dot(L, H), ior) - F0) * F0_norm;
/* Compute the real Fresnel term and remap it from real_F0..1 to F0..1.
* The reason why we use this remapping instead of directly doing the
* Schlick approximation lerp(F0, 1.0, (1.0-cosLH)^5) is that for cases
* with similar IORs (e.g. ice in water), the relative IOR can be close
* enough to 1.0 that the Schlick approximation becomes inaccurate. */
float real_F = fresnel_dielectric_cos(dot(L, H), ior);
float real_F0 = fresnel_dielectric_cos(1.0f, ior);
/* Blend between white and a specular color with respect to the fresnel */
return cspec0 * (1.0f - FH) + make_spectrum(FH);
return mix(F0, one_spectrum(), inverse_lerp(real_F0, 1.0f, real_F));
}
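
The remap preserves the exact dielectric endpoints, which the removed F0_norm code only approximated. A quick numeric check, assuming ior = 1.5 so that fresnel_dielectric_cos(1.0f, 1.5f) = ((1.5 - 1) / (1.5 + 1))^2 = 0.04:

/* Normal incidence: dot(L, H) == 1, so real_F == real_F0 == 0.04 and
 *   inverse_lerp(0.04, 1.0, 0.04) == 0.0 -> returns F0 unchanged. */
/* Grazing incidence: dot(L, H) == 0, so real_F == 1.0 and
 *   inverse_lerp(0.04, 1.0, 1.0) == 1.0 -> returns one_spectrum() (white). */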
ccl_device float3 ensure_valid_reflection(float3 Ng, float3 I, float3 N)

View File

@ -401,6 +401,72 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
kernel_index,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index)
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
kernel_integrator_state.path.shader_sort_key;
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
kernel_index,
num_states_limit,
indices,
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
#endif
}
ccl_gpu_kernel_postfix
ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_compact_paths_array,
int num_states,

View File

@ -178,7 +178,7 @@ __device__
simd_lane_index, \
simd_group_index, \
num_simd_groups, \
simdgroup_offset)
(threadgroup int *)threadgroup_array)
#elif defined(__KERNEL_ONEAPI__)
# define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \

View File

@ -19,6 +19,115 @@ CCL_NAMESPACE_BEGIN
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
#define GPU_PARALLEL_SORT_BLOCK_SIZE 1024
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *buckets,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Zero the bucket sizes. */
if (local_id < max_shaders) {
atomic_store_local(&buckets[local_id], 0);
}
ccl_gpu_syncthreads();
/* Determine bucket sizes within the partitions. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
}
}
ccl_gpu_syncthreads();
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
if (local_id == 0) {
int offset = 0;
for (int i = 0; i < max_shaders; i++) {
partition_key_offsets[i + uint(grid_id) * (max_shaders + 1)] = offset;
offset = offset + atomic_load_local(&buckets[i]);
}
/* Store the number of active states in this partition. */
partition_key_offsets[max_shaders + uint(grid_id) * (max_shaders + 1)] = offset;
}
}
ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
const uint queued_kernel,
const int num_states_limit,
ccl_global int *indices,
ccl_global ushort *d_queued_kernel,
ccl_global uint *d_shader_sort_key,
ccl_global int *partition_key_offsets,
ccl_gpu_shared int *local_offset,
const ushort local_id,
const ushort local_size,
const ushort grid_id)
{
/* Calculate each partition's global offset from the prefix sum of the active state counts per
* partition. */
if (local_id < max_shaders) {
int partition_offset = 0;
for (int i = 0; i < uint(grid_id); i++) {
int partition_key_count = partition_key_offsets[max_shaders + uint(i) * (max_shaders + 1)];
partition_offset += partition_key_count;
}
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * (max_shaders + 1));
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
ccl_gpu_syncthreads();
/* Write the sorted active indices. */
const uint partition_start = partition_size * uint(grid_id);
const uint partition_end = min(num_states, partition_start + partition_size);
ccl_global int *key_offsets = partition_key_offsets + (uint(grid_id) * max_shaders);
for (int state_index = partition_start + uint(local_id); state_index < partition_end;
state_index += uint(local_size)) {
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
}
}
}
#endif /* __KERNEL_LOCAL_ATOMIC_SORT__ */
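
Taken together, the two kernels implement a partitioned counting sort over the shader keys. A serial host-side sketch of the same data flow (a hypothetical mirror for a single partition, not part of the kernels) may help when reading them:

#include <cstdint>
#include <vector>

/* Serial sketch of the bucket + write passes for one partition. */
static void counting_sort_sketch(const std::vector<uint16_t> &queued_kernel,
                                 const std::vector<uint32_t> &shader_sort_key,
                                 const uint16_t kernel_index,
                                 const uint32_t max_shaders,
                                 std::vector<int> &indices)
{
  /* Bucket pass: count active states per shader key (buckets[] on the GPU). */
  std::vector<int> offsets(max_shaders + 1, 0);
  for (size_t i = 0; i < queued_kernel.size(); i++) {
    if (queued_kernel[i] == kernel_index) {
      offsets[shader_sort_key[i] % max_shaders + 1]++;
    }
  }
  /* Prefix sum turns counts into start offsets; the last slot holds the total
   * (the role of partition_key_offsets[max_shaders] on the GPU). */
  for (uint32_t key = 1; key <= max_shaders; key++) {
    offsets[key] += offsets[key - 1];
  }
  /* Write pass: scatter each state index into its key's slice. */
  indices.resize(offsets[max_shaders]);
  for (size_t i = 0; i < queued_kernel.size(); i++) {
    if (queued_kernel[i] == kernel_index) {
      indices[offsets[shader_sort_key[i] % max_shaders]++] = int(i);
    }
  }
}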
template<typename GetKeyOp>
__device__ void gpu_parallel_sorted_index_array(const uint state_index,

View File

@ -172,17 +172,14 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
kernel_assert(!"Invalid ift_local");
return false;
}
# endif
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
metalrt_intersector_type metalrt_intersect;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
if (is_null_intersection_function_table(metal_ancillaries->ift_local_prim)) {
if (local_isect) {
local_isect->num_hits = 0;
}
kernel_assert(!"Invalid ift_local_prim");
return false;
}
# endif
MetalRTIntersectionLocalPayload payload;
payload.self = ray->self;
@ -195,14 +192,48 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
payload.result = false;
typename metalrt_intersector_type::result_type intersection;
metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
# if defined(__METALRT_MOTION__)
metalrt_intersector_type metalrt_intersect;
typename metalrt_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, ray->time, metal_ancillaries->ift_local, payload);
# else
metalrt_blas_intersector_type metalrt_intersect;
typename metalrt_blas_intersector_type::result_type intersection;
metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
bool triangle_only = !kernel_data.bvh.have_curves && !kernel_data.bvh.have_points;
if (triangle_only) {
metalrt_intersect.assume_geometry_type(metal::raytracing::geometry_type::triangle);
}
// If we know we will get at most one hit, as for a random SSS walk, we can
// optimize by accepting the first hit
if (max_hits == 1) {
metalrt_intersect.accept_any_intersection(true);
}
int blas_index = metal_ancillaries->blas_userID_to_index_lookUp[local_object];
// Transform the ray into the object's local space
Transform itfm = kernel_data_fetch(objects, local_object).itfm;
r.origin = transform_point(&itfm, r.origin);
r.direction = transform_direction(&itfm, r.direction);
intersection = metalrt_intersect.intersect(
r, metal_ancillaries->accel_struct, 0xFF, metal_ancillaries->ift_local, payload);
r,
metal_ancillaries->blas_accel_structs[blas_index].blas,
metal_ancillaries->ift_local_prim,
payload);
# endif
if (lcg_state) {

View File

@ -105,10 +105,11 @@ struct kernel_gpu_##name \
{ \
PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
void run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -117,22 +118,24 @@ struct kernel_gpu_##name \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
threadgroup atomic_int *threadgroup_array[[ threadgroup(0) ]], \
const uint metal_global_id [[thread_position_in_grid]], \
const ushort metal_local_id [[thread_position_in_threadgroup]], \
const ushort metal_local_size [[threads_per_threadgroup]], \
const ushort metal_grid_id [[threadgroup_position_in_grid]], \
uint simdgroup_size [[threads_per_simdgroup]], \
uint simd_lane_index [[thread_index_in_simdgroup]], \
uint simd_group_index [[simdgroup_index_in_threadgroup]], \
uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
params_struct->run(context, threadgroup_array, metal_global_id, metal_local_id, metal_local_size, metal_grid_id, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
} \
void kernel_gpu_##name::run(thread MetalKernelContext& context, \
threadgroup int *simdgroup_offset, \
threadgroup atomic_int *threadgroup_array, \
const uint metal_global_id, \
const ushort metal_local_id, \
const ushort metal_local_size, \
const ushort metal_grid_id, \
uint simdgroup_size, \
uint simd_lane_index, \
uint simd_group_index, \
@ -263,13 +266,25 @@ ccl_device_forceinline uchar4 make_uchar4(const uchar x,
# if defined(__METALRT_MOTION__)
# define METALRT_TAGS instancing, instance_motion, primitive_motion
# define METALRT_BLAS_TAGS , primitive_motion
# else
# define METALRT_TAGS instancing
# define METALRT_BLAS_TAGS
# endif /* __METALRT_MOTION__ */
typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
# if defined(__METALRT_MOTION__)
typedef acceleration_structure<primitive_motion> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data, primitive_motion> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data, primitive_motion>
metalrt_blas_intersector_type;
# else
typedef acceleration_structure<> metalrt_blas_as_type;
typedef intersection_function_table<triangle_data> metalrt_blas_ift_type;
typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_type;
# endif
#endif /* __METALRT__ */
@ -282,6 +297,12 @@ struct Texture3DParamsMetal {
texture3d<float, access::sample> tex;
};
#ifdef __METALRT__
struct MetalRTBlasWrapper {
metalrt_blas_as_type blas;
};
#endif
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
@ -291,6 +312,9 @@ struct MetalAncillaries {
metalrt_ift_type ift_default;
metalrt_ift_type ift_shadow;
metalrt_ift_type ift_local;
metalrt_blas_ift_type ift_local_prim;
constant MetalRTBlasWrapper *blas_accel_structs;
constant int *blas_userID_to_index_lookUp;
#endif
};

View File

@ -139,6 +139,20 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
#endif
}
[[intersection(triangle, triangle_data )]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri_prim(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint primitive_id [[primitive_id]],
float2 barycentrics [[barycentric_coord]],
float ray_tmax [[distance]])
{
// instance_id (aka user_id) is not available here: this function is only taken when SSS is
// optimized to start traversal from a primitive acceleration structure instead of the root of
// the global AS, so we always intersect the correct object and no user-id check is needed.
return metalrt_local_hit<TriangleIntersectionResult, METALRT_HIT_TRIANGLE>(
launch_params_metal, payload, payload.local_object, primitive_id, barycentrics, ray_tmax);
}
[[intersection(triangle, triangle_data, METALRT_TAGS)]] TriangleIntersectionResult
__anyhit__cycles_metalrt_local_hit_tri(
constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
@ -163,6 +177,17 @@ __anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
return result;
}
[[intersection(bounding_box, triangle_data )]] BoundingBoxIntersectionResult
__anyhit__cycles_metalrt_local_hit_box_prim(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
result.distance = ray_tmax;
result.accept = false;
result.continue_search = false;
return result;
}
template<uint intersection_type>
bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload,

View File

@ -372,6 +372,16 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
oneapi_call(kg,
cgh,

View File

@ -58,23 +58,7 @@ ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals
normal += sc->N * sc->sample_weight;
sum_weight += sc->sample_weight;
Spectrum closure_albedo = sc->weight;
/* Closures that include a Fresnel term typically have weights close to 1 even though their
* actual contribution is significantly lower.
* To account for this, we scale their weight by the average fresnel factor (the same is also
* done for the sample weight in the BSDF setup, so we don't need to scale that here). */
if (CLOSURE_IS_BSDF_MICROFACET_FRESNEL(sc->type)) {
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)sc;
closure_albedo *= bsdf->extra->fresnel_color;
}
else if (sc->type == CLOSURE_BSDF_PRINCIPLED_SHEEN_ID) {
ccl_private PrincipledSheenBsdf *bsdf = (ccl_private PrincipledSheenBsdf *)sc;
closure_albedo *= bsdf->avg_value;
}
else if (sc->type == CLOSURE_BSDF_HAIR_PRINCIPLED_ID) {
closure_albedo *= bsdf_principled_hair_albedo(sc);
}
else if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
if (sc->type == CLOSURE_BSDF_PRINCIPLED_DIFFUSE_ID) {
/* BSSRDF already accounts for weight, retro-reflection would double up. */
ccl_private const PrincipledDiffuseBsdf *bsdf = (ccl_private const PrincipledDiffuseBsdf *)
sc;
@ -83,6 +67,7 @@ ccl_device_forceinline void film_write_denoising_features_surface(KernelGlobals
}
}
Spectrum closure_albedo = bsdf_albedo(sd, sc);
if (bsdf_get_specular_roughness_squared(sc) > sqr(0.075f)) {
diffuse_albedo += closure_albedo;
sum_nonspecular_weight += sc->sample_weight;

View File

@ -132,6 +132,9 @@ typedef struct IntegratorStateGPU {
/* Index of main path which will be used by a next shadow catcher split. */
ccl_global int *next_main_path_index;
/* Partition/key offsets used when writing sorted active indices. */
ccl_global int *sort_partition_key_offsets;
/* Divisor used to partition active indices by locality when sorting by material. */
uint sort_partition_divisor;
} IntegratorStateGPU;

View File

@ -115,6 +115,13 @@ ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}
@ -130,6 +137,13 @@ ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
# if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
if (!kernel_integrator_state.sort_key_counter[next_kernel]) {
return;
}
# endif
atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
}

View File

@ -621,7 +621,7 @@ ccl_device Spectrum surface_shader_diffuse(KernelGlobals kg, ccl_private const S
ccl_private const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_DIFFUSE(sc->type) || CLOSURE_IS_BSSRDF(sc->type))
eval += sc->weight;
eval += bsdf_albedo(sd, sc);
}
return eval;
@ -635,7 +635,7 @@ ccl_device Spectrum surface_shader_glossy(KernelGlobals kg, ccl_private const Sh
ccl_private const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_GLOSSY(sc->type))
eval += sc->weight;
eval += bsdf_albedo(sd, sc);
}
return eval;
@ -649,7 +649,7 @@ ccl_device Spectrum surface_shader_transmission(KernelGlobals kg, ccl_private co
ccl_private const ShaderClosure *sc = &sd->closure[i];
if (CLOSURE_IS_BSDF_TRANSMISSION(sc->type))
eval += sc->weight;
eval += bsdf_albedo(sd, sc);
}
return eval;

View File

@ -7,24 +7,13 @@
CCL_NAMESPACE_BEGIN
ccl_device float spot_light_attenuation(float3 dir,
float cos_half_spot_angle,
float spot_smooth,
float3 N)
ccl_device float spot_light_attenuation(const ccl_global KernelSpotLight *spot, float3 ray)
{
float attenuation = dot(dir, N);
const float3 scaled_ray = safe_normalize(
make_float3(dot(ray, spot->axis_u), dot(ray, spot->axis_v), dot(ray, spot->dir)) /
spot->len);
if (attenuation <= cos_half_spot_angle) {
attenuation = 0.0f;
}
else {
float t = attenuation - cos_half_spot_angle;
if (t < spot_smooth && spot_smooth != 0.0f)
attenuation *= smoothstepf(t / spot_smooth);
}
return attenuation;
return smoothstepf((scaled_ray.z - spot->cos_half_spot_angle) / spot->spot_smooth);
}
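
The rewrite folds the old cone test and smoothing branches into one expression: the ray is projected onto the spot's local frame (axis_u, axis_v, dir), rescaled by the per-axis len (which accounts for non-uniform object scale) and renormalized, so scaled_ray.z is the cosine of the angle to the spot axis. Assuming smoothstepf clamps its argument to [0, 1] like GLSL smoothstep, a worked example with a hypothetical 90-degree spot angle (half angle 45 degrees, cos_half_spot_angle ~= 0.7071) and spot_smooth = 0.1:

/* scaled_ray.z = 0.65   -> (0.65 - 0.7071) / 0.1 = -0.571 -> 0.0 (outside the cone) */
/* scaled_ray.z = 0.7571 -> (0.7571 - 0.7071) / 0.1 =  0.5  -> 0.5 (mid-ramp, since
 *                          smoothstep(0.5) = 3 * 0.25 - 2 * 0.125 = 0.5) */
/* scaled_ray.z = 0.90   -> 1.93 -> 1.0 (fully inside) */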
template<bool in_volume_segment>
@ -57,8 +46,7 @@ ccl_device_inline bool spot_light_sample(const ccl_global KernelLight *klight,
ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
/* spot light attenuation */
ls->eval_fac *= spot_light_attenuation(
klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
ls->eval_fac *= spot_light_attenuation(&klight->spot, -ls->D);
if (!in_volume_segment && ls->eval_fac == 0.0f) {
return false;
}
@ -87,8 +75,7 @@ ccl_device_forceinline void spot_light_update_position(const ccl_global KernelLi
ls->pdf = invarea;
/* spot light attenuation */
ls->eval_fac *= spot_light_attenuation(
klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, ls->Ng);
ls->eval_fac *= spot_light_attenuation(&klight->spot, ls->Ng);
}
ccl_device_inline bool spot_light_intersect(const ccl_global KernelLight *klight,
@ -129,8 +116,7 @@ ccl_device_inline bool spot_light_sample_from_intersection(
ls->pdf = invarea;
/* spot light attenuation */
ls->eval_fac *= spot_light_attenuation(
klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
ls->eval_fac *= spot_light_attenuation(&klight->spot, -ls->D);
if (ls->eval_fac == 0.0f) {
return false;

View File

@ -209,14 +209,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
closure->distribution == make_string("default", 4430693559278735917ull)) {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_ggx_refraction_setup(bsdf);
@ -225,14 +218,7 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
/* Beckmann */
else {
if (!closure->refract) {
if (closure->alpha_x == closure->alpha_y) {
/* Isotropic */
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
}
else {
/* Anisotropic */
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
else {
sd->flag |= bsdf_microfacet_beckmann_refraction_setup(bsdf);
@ -258,9 +244,9 @@ ccl_device void osl_closure_microfacet_ggx_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_ggx_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_ggx_setup(bsdf);
}
ccl_device void osl_closure_microfacet_ggx_aniso_setup(
@ -345,7 +331,6 @@ ccl_device void osl_closure_microfacet_ggx_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -383,7 +368,6 @@ ccl_device void osl_closure_microfacet_ggx_aniso_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = closure->T;
@ -426,7 +410,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -467,7 +450,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_glass_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -508,7 +490,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_aniso_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
bsdf->T = closure->T;
@ -551,7 +532,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -592,7 +572,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_glass_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = zero_float3();
@ -633,7 +612,6 @@ ccl_device void osl_closure_microfacet_multi_ggx_aniso_fresnel_setup(
bsdf->extra = extra;
bsdf->extra->color = rgb_to_spectrum(closure->color);
bsdf->extra->cspec0 = rgb_to_spectrum(closure->cspec0);
bsdf->extra->clearcoat = 0.0f;
bsdf->T = closure->T;
@ -660,9 +638,9 @@ ccl_device void osl_closure_microfacet_beckmann_setup(
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->alpha_x;
bsdf->alpha_x = bsdf->alpha_y = closure->alpha_x;
sd->flag |= bsdf_microfacet_beckmann_isotropic_setup(bsdf);
sd->flag |= bsdf_microfacet_beckmann_setup(bsdf);
}
ccl_device void osl_closure_microfacet_beckmann_aniso_setup(
@ -865,27 +843,18 @@ ccl_device void osl_closure_principled_clearcoat_setup(
float3 weight,
ccl_private const PrincipledClearcoatClosure *closure)
{
weight *= 0.25f * closure->clearcoat;
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
sd, sizeof(MicrofacetBsdf), rgb_to_spectrum(weight));
if (!bsdf) {
return;
}
MicrofacetExtra *extra = (MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra));
if (!extra) {
return;
}
bsdf->N = ensure_valid_reflection(sd->Ng, sd->wi, closure->N);
bsdf->alpha_x = closure->clearcoat_roughness;
bsdf->alpha_y = closure->clearcoat_roughness;
bsdf->ior = 1.5f;
bsdf->extra = extra;
bsdf->extra->color = zero_spectrum();
bsdf->extra->cspec0 = make_spectrum(0.04f);
bsdf->extra->clearcoat = closure->clearcoat;
bsdf->T = zero_float3();
sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);

View File

@ -46,17 +46,8 @@ ccl_device_noinline_cpu float2 svm_brick(float3 p,
float tint = saturatef((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) + bias));
float min_dist = min(min(x, y), min(brick_width - x, row_height - y));
float mortar;
if (min_dist >= mortar_size) {
mortar = 0.0f;
}
else if (mortar_smooth == 0.0f) {
mortar = 1.0f;
}
else {
min_dist = 1.0f - min_dist / mortar_size;
mortar = (min_dist < mortar_smooth) ? smoothstepf(min_dist / mortar_smooth) : 1.0f;
}
min_dist = 1.0f - min_dist / mortar_size;
float mortar = smoothstepf(min_dist / mortar_smooth);
return make_float2(tint, mortar);
}
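The three-way branch collapses because `smoothstepf` (see its hunk further down) now clamps its argument internally. A minimal Python sketch of the simplified mortar logic; the explicit zero-`mortar_smooth` guard is an assumption added here because Python raises on float division by zero, where C produces an infinity that the clamp absorbs:

def smoothstepf(f):
    # Clamped cubic smoothstep: 0 below 0, 1 above 1, 3*f^2 - 2*f^3 between.
    if f <= 0.0:
        return 0.0
    if f >= 1.0:
        return 1.0
    return 3.0 * f * f - 2.0 * f * f * f

def mortar_mask(min_dist, mortar_size, mortar_smooth):
    t = 1.0 - min_dist / mortar_size  # non-positive outside the mortar band
    if mortar_smooth == 0.0:
        return 1.0 if t > 0.0 else 0.0  # C reaches this case via t / 0.0 == +/-inf
    return smoothstepf(t / mortar_smooth)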

View File

@ -333,7 +333,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
bsdf->extra->cspec0 = rgb_to_spectrum(
(specular * 0.08f * tmp_col) * (1.0f - metallic) + base_color * metallic);
bsdf->extra->color = rgb_to_spectrum(base_color);
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
if (distribution == CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID ||
@ -383,7 +382,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
bsdf->extra->color = rgb_to_spectrum(base_color);
bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
sd->flag |= bsdf_microfacet_ggx_fresnel_setup(bsdf, sd);
@ -440,7 +438,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
bsdf->extra->color = rgb_to_spectrum(base_color);
bsdf->extra->cspec0 = rgb_to_spectrum(cspec0);
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
sd->flag |= bsdf_microfacet_multi_ggx_glass_fresnel_setup(bsdf, sd);
@ -455,30 +452,20 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
#ifdef __CAUSTICS_TRICKS__
if (kernel_data.integrator.caustics_reflective || (path_flag & PATH_RAY_DIFFUSE) == 0) {
#endif
if (clearcoat > CLOSURE_WEIGHT_CUTOFF) {
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
sd, sizeof(MicrofacetBsdf), weight);
ccl_private MicrofacetExtra *extra =
(bsdf != NULL) ?
(ccl_private MicrofacetExtra *)closure_alloc_extra(sd, sizeof(MicrofacetExtra)) :
NULL;
Spectrum clearcoat_weight = 0.25f * clearcoat * weight;
ccl_private MicrofacetBsdf *bsdf = (ccl_private MicrofacetBsdf *)bsdf_alloc(
sd, sizeof(MicrofacetBsdf), clearcoat_weight);
if (bsdf && extra) {
bsdf->N = clearcoat_normal;
bsdf->T = zero_float3();
bsdf->ior = 1.5f;
bsdf->extra = extra;
if (bsdf) {
bsdf->N = clearcoat_normal;
bsdf->T = zero_float3();
bsdf->ior = 1.5f;
bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
bsdf->alpha_x = clearcoat_roughness * clearcoat_roughness;
bsdf->alpha_y = clearcoat_roughness * clearcoat_roughness;
bsdf->extra->color = zero_spectrum();
bsdf->extra->cspec0 = make_spectrum(0.04f);
bsdf->extra->clearcoat = clearcoat;
/* setup bsdf */
sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
}
/* setup bsdf */
sd->flag |= bsdf_microfacet_ggx_clearcoat_setup(bsdf, sd);
}
#ifdef __CAUSTICS_TRICKS__
}
@ -584,7 +571,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
if (bsdf->extra) {
bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.w));
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
sd->flag |= bsdf_microfacet_multi_ggx_setup(bsdf);
}
}
@ -724,7 +710,6 @@ ccl_device_noinline int svm_node_closure_bsdf(KernelGlobals kg,
kernel_assert(stack_valid(data_node.z));
bsdf->extra->color = rgb_to_spectrum(stack_load_float3(stack, data_node.z));
bsdf->extra->cspec0 = zero_spectrum();
bsdf->extra->clearcoat = 0.0f;
/* setup bsdf */
sd->flag |= bsdf_microfacet_multi_ggx_glass_setup(bsdf);

View File

@ -489,8 +489,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BSDF_MICROFACET_FRESNEL(type) \
(type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID || \
type == CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID)
type == CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID)
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)
#define CLOSURE_IS_BSSRDF(type) \
(type >= CLOSURE_BSSRDF_BURLEY_ID && type <= CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID)

View File

@ -74,7 +74,8 @@ CCL_NAMESPACE_BEGIN
#define __VOLUME__
/* TODO: solve internal compiler errors and enable light tree on HIP. */
#ifdef __KERNEL_HIP__
/* TODO: solve internal compiler perf issue and enable light tree on Metal/AMD. */
#if defined(__KERNEL_HIP__) || defined(__KERNEL_METAL_AMD__)
# undef __LIGHT_TREE__
#endif
@ -1290,12 +1291,14 @@ typedef struct KernelCurveSegment {
static_assert_align(KernelCurveSegment, 8);
typedef struct KernelSpotLight {
packed_float3 axis_u;
float radius;
packed_float3 axis_v;
float invarea;
float cos_half_spot_angle;
float spot_smooth;
packed_float3 dir;
float pad;
float cos_half_spot_angle;
packed_float3 len;
float spot_smooth;
} KernelSpotLight;
/* PointLight is SpotLight with only radius and invarea being used. */
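The reshuffled layout interleaves each `packed_float3` with one scalar so that every pair fills a 16-byte row, making room for the new `len` field without growing the struct. A hedged ctypes sketch of the packing, assuming `packed_float3` is a plain 12-byte float triple as on the GPU:

import ctypes

class PackedFloat3(ctypes.Structure):
    # 12 bytes: three floats, no hidden padding lane.
    _fields_ = [("x", ctypes.c_float), ("y", ctypes.c_float), ("z", ctypes.c_float)]

class KernelSpotLightSketch(ctypes.Structure):
    # Each packed_float3 + float pair occupies exactly one 16-byte row.
    _fields_ = [
        ("axis_u", PackedFloat3), ("radius", ctypes.c_float),
        ("axis_v", PackedFloat3), ("invarea", ctypes.c_float),
        ("dir", PackedFloat3), ("cos_half_spot_angle", ctypes.c_float),
        ("len", PackedFloat3), ("spot_smooth", ctypes.c_float),
    ]

assert ctypes.sizeof(KernelSpotLightSketch) == 4 * 16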
@ -1506,6 +1509,8 @@ typedef enum DeviceKernel : int {
DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS,
DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS,
DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY,
DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES,
DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY,

View File

@ -1076,23 +1076,31 @@ void LightManager::device_update_lights(Device *device, DeviceScene *dscene, Sce
else if (light->light_type == LIGHT_SPOT) {
shader_id &= ~SHADER_AREA_LIGHT;
float3 len;
float3 axis_u = normalize_len(light->axisu, &len.x);
float3 axis_v = normalize_len(light->axisv, &len.y);
float3 dir = normalize_len(light->dir, &len.z);
if (len.z == 0.0f) {
dir = zero_float3();
}
float radius = light->size;
float invarea = (radius > 0.0f) ? 1.0f / (M_PI_F * radius * radius) : 1.0f;
float cos_half_spot_angle = cosf(light->spot_angle * 0.5f);
float spot_smooth = (1.0f - cos_half_spot_angle) * light->spot_smooth;
float3 dir = light->dir;
dir = safe_normalize(dir);
if (light->use_mis && radius > 0.0f)
shader_id |= SHADER_USE_MIS;
klights[light_index].co = co;
klights[light_index].spot.axis_u = axis_u;
klights[light_index].spot.radius = radius;
klights[light_index].spot.axis_v = axis_v;
klights[light_index].spot.invarea = invarea;
klights[light_index].spot.cos_half_spot_angle = cos_half_spot_angle;
klights[light_index].spot.spot_smooth = spot_smooth;
klights[light_index].spot.dir = dir;
klights[light_index].spot.cos_half_spot_angle = cos_half_spot_angle;
klights[light_index].spot.len = len;
klights[light_index].spot.spot_smooth = spot_smooth;
}
klights[light_index].shader_id = shader_id;
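For reference, the quantities cached per spot light in this block are, with radius $r$, spot angle $\theta$ and smooth factor $s$:

$$\mathrm{invarea} = \frac{1}{\pi r^{2}}, \qquad \mathrm{cos\_half\_spot\_angle} = \cos\frac{\theta}{2}, \qquad \mathrm{spot\_smooth} = \Bigl(1 - \cos\frac{\theta}{2}\Bigr)\, s.$$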

View File

@ -156,7 +156,13 @@ LightTreePrimitive::LightTreePrimitive(Scene *scene, int prim_id, int object_id)
}
else if (type == LIGHT_SPOT) {
bcone.theta_o = 0;
bcone.theta_e = lamp->get_spot_angle() * 0.5f;
const float unscaled_theta_e = lamp->get_spot_angle() * 0.5f;
const float len_u = len(lamp->get_axisu());
const float len_v = len(lamp->get_axisv());
const float len_w = len(lamp->get_dir());
bcone.theta_e = fast_atanf(fast_tanf(unscaled_theta_e) * fmaxf(len_u, len_v) / len_w);
/* Point and spot lights can emit light from any point within their radius. */
const float3 radius = make_float3(size);
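The widened bound accounts for non-uniform object scale: half-angle tangents scale with the axes, so with axis lengths $\lVert u \rVert$, $\lVert v \rVert$ and direction length $\lVert w \rVert$ the effective emission half-angle is

$$\theta_e = \arctan\!\Bigl(\tan(\theta_e^{\mathrm{unscaled}}) \cdot \frac{\max(\lVert u \rVert, \lVert v \rVert)}{\lVert w \rVert}\Bigr),$$

which reduces to the unscaled angle under uniform scale.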

View File

@ -73,16 +73,55 @@ ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_s
return new_value.float_value;
}
# define atomic_fetch_and_add_uint32(p, x) \
atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_sub_uint32(p, x) \
atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
# define atomic_fetch_and_inc_uint32(p) \
atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_dec_uint32(p) \
atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
# define atomic_fetch_and_or_uint32(p, x) \
atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(device T *p, int x)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(device T *p, int x)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(device T *p)
{
return atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(device T *p)
{
return atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(device T *p, int x)
{
return atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_add_uint32(threadgroup T *p, int x)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_sub_uint32(threadgroup T *p, int x)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_inc_uint32(threadgroup T *p)
{
return atomic_fetch_add_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_dec_uint32(threadgroup T *p)
{
return atomic_fetch_sub_explicit((threadgroup atomic_uint *)p, 1, memory_order_relaxed);
}
template<class T> ccl_device_inline uint32_t atomic_fetch_and_or_uint32(threadgroup T *p, int x)
{
return atomic_fetch_or_explicit((threadgroup atomic_uint *)p, x, memory_order_relaxed);
}
ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float *dest,
const float old_val,

View File

@ -69,6 +69,9 @@ void DebugFlags::Metal::reset()
{
if (getenv("CYCLES_METAL_ADAPTIVE_COMPILE") != NULL)
adaptive_compile = true;
if (auto str = getenv("CYCLES_METAL_LOCAL_ATOMIC_SORT"))
use_local_atomic_sort = (atoi(str) != 0);
}
DebugFlags::OptiX::OptiX()

View File

@ -97,6 +97,9 @@ class DebugFlags {
/* Whether adaptive feature based runtime compile is enabled or not. */
bool adaptive_compile = false;
/* Whether local atomic sorting is enabled or not. */
bool use_local_atomic_sort = true;
};
/* Get instance of debug flags registry. */

View File

@ -483,6 +483,12 @@ ccl_device_inline float compatible_signf(float f)
ccl_device_inline float smoothstepf(float f)
{
if (f <= 0.0f) {
return 0.0f;
}
if (f >= 1.0f) {
return 1.0f;
}
float ff = f * f;
return (3.0f * ff - 2.0f * ff * f);
}
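With the added clamps the function is total over the reals:

$$\operatorname{smoothstep}(f) = \begin{cases} 0, & f \le 0, \\ 3f^{2} - 2f^{3}, & 0 < f < 1, \\ 1, & f \ge 1, \end{cases}$$

which is what allows the brick-texture hunk above to drop its own range checks.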

View File

@ -74,7 +74,7 @@ ccl_device float fast_sinf(float x)
*
* Results on: [-2pi,2pi].
*
* Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp,
* Examined 2173837240 values of sin: 0.00662760244 avg ULP diff, 2 max ULP,
* 1.19209e-07 max error
*/
int q = fast_rint(x * M_1_PI_F);
@ -256,11 +256,11 @@ ccl_device float fast_acosf(float x)
/* clamp and crush denormals. */
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
/* Based on http://www.pouet.net/topic.php?which=9132&page=2
* 85% accurate (ulp 0)
* 85% accurate (ULP 0)
* Examined 2130706434 values of acos:
* 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
* 15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
* Examined 2130706434 values of acos:
* 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
* 15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
*/
const float a = sqrtf(1.0f - m) *
(1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
@ -270,9 +270,8 @@ ccl_device float fast_acosf(float x)
ccl_device float fast_asinf(float x)
{
/* Based on acosf approximation above.
* Max error is 4.51133e-05 (ulps are higher because we are consistently off
* by a little amount).
*/
* Max error is 4.51133e-05 (ULPS are higher because we are consistently off
* by a little amount). */
const float f = fabsf(x);
/* Clamp and crush denormals. */
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
@ -290,9 +289,9 @@ ccl_device float fast_atanf(float x)
const float t = s * s;
/* http://mathforum.org/library/drmath/view/62672.html
* Examined 4278190080 values of atan:
* 2.36864877 avg ulp diff, 302 max ulp, 6.55651e-06 max error // (with denormals)
* 2.36864877 avg ULP diff, 302 max ULP, 6.55651e-06 max error // (with denormals)
* Examined 4278190080 values of atan:
* 171160502 avg ulp diff, 855638016 max ulp, 6.55651e-06 max error // (crush denormals)
* 171160502 avg ULP diff, 855638016 max ULP, 6.55651e-06 max error // (crush denormals)
*/
float r = s * madd(0.43157974f, t, 1.0f) / madd(madd(0.05831938f, t, 0.76443945f), t, 1.0f);
if (a > 1.0f) {
@ -343,8 +342,8 @@ ccl_device float fast_log2f(float x)
int exponent = (int)(bits >> 23) - 127;
float f = __uint_as_float((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
/* Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]:
* 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error.
* ulp histogram:
* 0.0797524457 avg ULP diff, 3713596 max ULP, 7.62939e-06 max error.
* ULP histogram:
* 0 = 97.46%
* 1 = 2.29%
* 2 = 0.11%
@ -363,7 +362,7 @@ ccl_device float fast_log2f(float x)
ccl_device_inline float fast_logf(float x)
{
/* Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]:
* 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error.
* 0.313865375 avg ULP diff, 5148137 max ULP, 7.62939e-06 max error.
*/
return fast_log2f(x) * M_LN2_F;
}
@ -371,7 +370,7 @@ ccl_device_inline float fast_logf(float x)
ccl_device_inline float fast_log10(float x)
{
/* Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]:
* 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error.
* 0.631237033 avg ULP diff, 4471615 max ULP, 3.8147e-06 max error.
*/
return fast_log2f(x) * M_LN2_F / M_LN10_F;
}
@ -392,12 +391,12 @@ ccl_device float fast_exp2f(float x)
/* Range reduction. */
int m = (int)x;
x -= m;
x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ulps!). */
x = 1.0f - (1.0f - x); /* Crush denormals (does not affect max ULPS!). */
/* 5th degree polynomial generated with sollya
* Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff,
* 232 max ulp.
* Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ULP diff,
* 232 max ULP.
*
* ulp histogram:
* ULP histogram:
* 0 = 87.81%
* 1 = 4.18%
*/
@ -415,7 +414,7 @@ ccl_device float fast_exp2f(float x)
ccl_device_inline float fast_expf(float x)
{
/* Examined 2237485550 values of exp on [-87.3300018,87.3300018]:
* 2.6666452 avg ulp diff, 230 max ulp.
* 2.6666452 avg ULP diff, 230 max ULP.
*/
return fast_exp2f(x / M_LN2_F);
}
@ -454,7 +453,7 @@ ccl_device_inline float4 fast_expf4(float4 x)
ccl_device_inline float fast_exp10(float x)
{
/* Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]:
* 2.71732409 avg ulp diff, 232 max ulp.
* 2.71732409 avg ULP diff, 232 max ULP.
*/
return fast_exp2f(x * M_LN10_F / M_LN2_F);
}
@ -475,7 +474,7 @@ ccl_device float fast_sinhf(float x)
float a = fabsf(x);
if (a > 1.0f) {
/* Examined 53389559 values of sinh on [1,87.3300018]:
* 33.6886442 avg ulp diff, 178 max ulp. */
* 33.6886442 avg ULP diff, 178 max ULP. */
float e = fast_expf(a);
return copysignf(0.5f * e - 0.5f / e, x);
}
@ -495,7 +494,7 @@ ccl_device float fast_sinhf(float x)
ccl_device_inline float fast_coshf(float x)
{
/* Examined 2237485550 values of cosh on [-87.3300018,87.3300018]:
* 1.78256726 avg ulp diff, 178 max ulp.
* 1.78256726 avg ULP diff, 178 max ULP.
*/
float e = fast_expf(fabsf(x));
return 0.5f * e + 0.5f / e;
@ -506,7 +505,7 @@ ccl_device_inline float fast_tanhf(float x)
/* Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]:
* 3.12924e-06 max error.
*/
/* NOTE: ulp error is high because of sub-optimal handling around the origin. */
/* NOTE: ULP error is high because of sub-optimal handling around the origin. */
float e = fast_expf(2.0f * fabsf(x));
return copysignf(1.0f - 2.0f / (1.0f + e), x);
}
@ -579,7 +578,7 @@ ccl_device_inline float fast_erfcf(float x)
{
/* Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error.
*
* ulp histogram:
* ULP histogram:
*
* 0 = 80.30%
*/

View File

@ -1201,7 +1201,7 @@ void GHOST_GetVulkanHandles(GHOST_ContextHandle context,
void *r_instance,
void *r_physical_device,
void *r_device,
uint32_t *r_graphic_queue_familly);
uint32_t *r_graphic_queue_family);
/**
* Return VULKAN back-buffer resources handles for the given window.

View File

@ -1203,10 +1203,10 @@ void GHOST_GetVulkanHandles(GHOST_ContextHandle contexthandle,
void *r_instance,
void *r_physical_device,
void *r_device,
uint32_t *r_graphic_queue_familly)
uint32_t *r_graphic_queue_family)
{
GHOST_IContext *context = (GHOST_IContext *)contexthandle;
context->getVulkanHandles(r_instance, r_physical_device, r_device, r_graphic_queue_familly);
context->getVulkanHandles(r_instance, r_physical_device, r_device, r_graphic_queue_family);
}
void GHOST_GetVulkanBackbuffer(GHOST_WindowHandle windowhandle,

View File

@ -142,7 +142,7 @@ class GHOST_Context : public GHOST_IContext {
virtual GHOST_TSuccess getVulkanHandles(void * /*r_instance*/,
void * /*r_physical_device*/,
void * /*r_device*/,
uint32_t * /*r_graphic_queue_familly*/) override
uint32_t * /*r_graphic_queue_family*/) override
{
return GHOST_kFailure;
};

View File

@ -311,12 +311,12 @@ GHOST_TSuccess GHOST_ContextVK::getVulkanBackbuffer(void *image,
GHOST_TSuccess GHOST_ContextVK::getVulkanHandles(void *r_instance,
void *r_physical_device,
void *r_device,
uint32_t *r_graphic_queue_familly)
uint32_t *r_graphic_queue_family)
{
*((VkInstance *)r_instance) = m_instance;
*((VkPhysicalDevice *)r_physical_device) = m_physical_device;
*((VkDevice *)r_device) = m_device;
*r_graphic_queue_familly = m_queue_family_graphic;
*r_graphic_queue_family = m_queue_family_graphic;
return GHOST_kSuccess;
}
@ -520,13 +520,14 @@ static GHOST_TSuccess getGraphicQueueFamily(VkPhysicalDevice device, uint32_t *r
*r_queue_index = 0;
for (const auto &queue_family : queue_families) {
if (queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) {
if ((queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) &&
(queue_family.queueFlags & VK_QUEUE_COMPUTE_BIT)) {
return GHOST_kSuccess;
}
(*r_queue_index)++;
}
fprintf(stderr, "Couldn't find any Graphic queue familly on selected device\n");
fprintf(stderr, "Couldn't find any Graphic queue family on selected device\n");
return GHOST_kFailure;
}
@ -551,7 +552,7 @@ static GHOST_TSuccess getPresetQueueFamily(VkPhysicalDevice device,
(*r_queue_index)++;
}
fprintf(stderr, "Couldn't find any Present queue familly on selected device\n");
fprintf(stderr, "Couldn't find any Present queue family on selected device\n");
return GHOST_kFailure;
}

View File

@ -113,7 +113,7 @@ class GHOST_ContextVK : public GHOST_Context {
GHOST_TSuccess getVulkanHandles(void *r_instance,
void *r_physical_device,
void *r_device,
uint32_t *r_graphic_queue_familly);
uint32_t *r_graphic_queue_family);
/**
* Gets the Vulkan framebuffer related resource handles associated with the Vulkan context.
* Needs to be called after each swap events as the framebuffer will change.

View File

@ -82,6 +82,8 @@
#include "CLG_log.h"
#ifdef USE_EVENT_BACKGROUND_THREAD
# include "GHOST_TimerTask.h"
# include <pthread.h>
#endif
@ -768,7 +770,12 @@ struct GWL_Seat {
int32_t rate = 0;
/** Time (milliseconds) after which to start repeating keys. */
int32_t delay = 0;
/** Timer for key repeats. */
/**
* Timer for key repeats.
*
* \note For as long as #USE_EVENT_BACKGROUND_THREAD is defined, any access to this
* (including null checks) must lock `timer_mutex` first.
*/
GHOST_ITimerTask *timer = nullptr;
} key_repeat;
@ -832,6 +839,42 @@ static bool gwl_seat_key_depressed_suppress_warning(const GWL_Seat *seat)
return suppress_warning;
}
/**
* \note Caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_add(GWL_Seat *seat,
GHOST_TimerProcPtr key_repeat_fn,
GHOST_TUserDataPtr payload,
const bool use_delay)
{
GHOST_SystemWayland *system = seat->system;
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerTask *timer = new GHOST_TimerTask(
system->getMilliSeconds() + time_start, time_step, key_repeat_fn, payload);
seat->key_repeat.timer = timer;
system->ghost_timer_manager()->addTimer(timer);
#else
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
#endif
}
/**
* \note The caller must lock `timer_mutex`.
*/
static void gwl_seat_key_repeat_timer_remove(GWL_Seat *seat)
{
GHOST_SystemWayland *system = seat->system;
#ifdef USE_EVENT_BACKGROUND_THREAD
system->ghost_timer_manager()->removeTimer(
static_cast<GHOST_TimerTask *>(seat->key_repeat.timer));
#else
system->removeTimer(seat->key_repeat.timer);
#endif
seat->key_repeat.timer = nullptr;
}
/** \} */
/* -------------------------------------------------------------------- */
@ -906,6 +949,16 @@ struct GWL_Display {
/** Guard against multiple threads accessing `events_pending` at once. */
std::mutex events_pending_mutex;
/**
* A separate timer queue, needed so the WAYLAND thread can lock access.
* Using the system's #GHOST_System::getTimerManager is not thread safe because
* access to the timer outside of WAYLAND specific logic will not lock.
*
* Needed because #GHOST_System::dispatchEvents fires timers
* outside of WAYLAND (without locking the `timer_mutex`).
*/
GHOST_TimerManager *ghost_timer_manager;
#endif /* USE_EVENT_BACKGROUND_THREAD */
};
@ -959,6 +1012,11 @@ static void gwl_display_destroy(GWL_Display *display)
gwl_display_event_thread_destroy(display);
display->system->server_mutex->unlock();
}
/* Important to remove this after the seats, which may have active key-repeat timers. */
delete display->ghost_timer_manager;
display->ghost_timer_manager = nullptr;
#endif /* USE_EVENT_BACKGROUND_THREAD */
if (display->wl_display) {
@ -3718,9 +3776,14 @@ static void keyboard_handle_leave(void *data,
GWL_Seat *seat = static_cast<GWL_Seat *>(data);
seat->keyboard.wl_surface_window = nullptr;
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* Losing focus must stop repeating text. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@ -3780,36 +3843,32 @@ static xkb_keysym_t xkb_state_key_get_one_sym_without_modifiers(
return sym;
}
/**
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_cancel(GWL_Seat *seat)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller must check for timer");
delete static_cast<GWL_KeyRepeatPlayload *>(seat->key_repeat.timer->getUserData());
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
}
/**
* Restart the key-repeat timer.
* \param use_delay: When false, use the interval
* (prevents pause when the setting changes while the key is held).
*
* \note Caller must lock `timer_mutex`.
*/
static void keyboard_handle_key_repeat_reset(GWL_Seat *seat, const bool use_delay)
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
GHOST_ASSERT(seat->key_repeat.timer != nullptr, "Caller must check for timer");
GHOST_SystemWayland *system = seat->system;
GHOST_ITimerTask *timer = seat->key_repeat.timer;
GHOST_TimerProcPtr key_repeat_fn = timer->getTimerProc();
GHOST_TimerProcPtr key_repeat_fn = seat->key_repeat.timer->getTimerProc();
GHOST_TUserDataPtr payload = seat->key_repeat.timer->getUserData();
seat->system->removeTimer(seat->key_repeat.timer);
const uint64_t time_step = 1000 / seat->key_repeat.rate;
const uint64_t time_start = use_delay ? seat->key_repeat.delay : time_step;
seat->key_repeat.timer = system->installTimer(time_start, time_step, key_repeat_fn, payload);
gwl_seat_key_repeat_timer_remove(seat);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, payload, use_delay);
}
static void keyboard_handle_key(void *data,
@ -3848,6 +3907,11 @@ static void keyboard_handle_key(void *data,
break;
}
#ifdef USE_EVENT_BACKGROUND_THREAD
/* Any access to `seat->key_repeat.timer` must lock. */
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
struct GWL_KeyRepeatPlayload *key_repeat_payload = nullptr;
/* Delete previous timer. */
@ -3886,23 +3950,14 @@ static void keyboard_handle_key(void *data,
break;
}
case RESET: {
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* The payload will be added again. */
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
break;
}
case CANCEL: {
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
delete key_repeat_payload;
key_repeat_payload = nullptr;
seat->system->removeTimer(seat->key_repeat.timer);
seat->key_repeat.timer = nullptr;
gwl_seat_key_repeat_timer_remove(seat);
break;
}
}
@ -3956,8 +4011,8 @@ static void keyboard_handle_key(void *data,
utf8_buf));
}
};
seat->key_repeat.timer = seat->system->installTimer(
seat->key_repeat.delay, 1000 / seat->key_repeat.rate, key_repeat_fn, key_repeat_payload);
gwl_seat_key_repeat_timer_add(seat, key_repeat_fn, key_repeat_payload, true);
}
}
@ -3982,8 +4037,13 @@ static void keyboard_handle_modifiers(void *data,
/* A modifier changed so reset the timer,
* see comment in #keyboard_handle_key regarding this behavior. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, true);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, true);
}
}
#ifdef USE_GNOME_KEYBOARD_SUPPRESS_WARNING
@ -4002,9 +4062,14 @@ static void keyboard_repeat_handle_info(void *data,
seat->key_repeat.rate = rate;
seat->key_repeat.delay = delay;
/* Unlikely but possible: this setting changes while repeating. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, false);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
/* Unlikely but possible: this setting changes while repeating. */
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_reset(seat, false);
}
}
}
@ -4275,8 +4340,14 @@ static void gwl_seat_capability_keyboard_disable(GWL_Seat *seat)
if (!seat->wl_keyboard) {
return;
}
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
{
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*seat->system->timer_mutex};
#endif
if (seat->key_repeat.timer) {
keyboard_handle_key_repeat_cancel(seat);
}
}
wl_keyboard_destroy(seat->wl_keyboard);
seat->wl_keyboard = nullptr;
@ -5411,6 +5482,8 @@ GHOST_SystemWayland::GHOST_SystemWayland(bool background)
#ifdef USE_EVENT_BACKGROUND_THREAD
gwl_display_event_thread_create(display_);
display_->ghost_timer_manager = new GHOST_TimerManager();
#endif
}
@ -5491,10 +5564,16 @@ bool GHOST_SystemWayland::processEvents(bool waitForEvent)
#endif /* USE_EVENT_BACKGROUND_THREAD */
{
const uint64_t now = getMilliSeconds();
#ifdef USE_EVENT_BACKGROUND_THREAD
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
{
std::lock_guard lock_timer_guard{*display_->system->timer_mutex};
if (ghost_timer_manager()->fireTimers(now)) {
any_processed = true;
}
}
#endif
if (getTimerManager()->fireTimers(getMilliSeconds())) {
if (getTimerManager()->fireTimers(now)) {
any_processed = true;
}
}
@ -6717,6 +6796,13 @@ struct wl_shm *GHOST_SystemWayland::wl_shm() const
return display_->wl_shm;
}
#ifdef USE_EVENT_BACKGROUND_THREAD
GHOST_TimerManager *GHOST_SystemWayland::ghost_timer_manager()
{
return display_->ghost_timer_manager;
}
#endif
/** \} */
/* -------------------------------------------------------------------- */

View File

@ -165,6 +165,16 @@ class GHOST_SystemWayland : public GHOST_System {
bool cursor_grab_use_software_display_get(const GHOST_TGrabCursorMode mode);
#ifdef USE_EVENT_BACKGROUND_THREAD
/**
* Return a WAYLAND-local timer manager, separate from #GHOST_System::getTimerManager.
* Manipulation & access must lock with #GHOST_SystemWayland::timer_mutex.
*
* See #GWL_Display::ghost_timer_manager doc-string for details on why this is needed.
*/
GHOST_TimerManager *ghost_timer_manager();
#endif
/* WAYLAND direct-data access. */
struct wl_display *wl_display();
@ -233,7 +243,14 @@ class GHOST_SystemWayland : public GHOST_System {
* from running at the same time. */
std::mutex *server_mutex = nullptr;
/** Threads must lock this before manipulating timers. */
/**
* Threads must lock this before manipulating #GWL_Display::ghost_timer_manager.
*
* \note Using a separate lock to `server_mutex` is necessary because the
* server lock is already held when calling `ghost_wl_display_event_pump`.
* If manipulating the timer used the `server_mutex`, the event pump could indirectly
* handle key up/down events that would lock `server_mutex` again, causing a dead-lock.
*/
std::mutex *timer_mutex = nullptr;
std::thread::id main_thread_id;

View File

@ -282,7 +282,7 @@ class GHOST_SystemWin32 : public GHOST_System {
GHOST_TSuccess exit();
/**
* Converts raw WIN32 key codes from the wndproc to GHOST keys.
* Converts raw WIN32 key codes from the `wndproc` to GHOST keys.
* \param vKey: The virtual key from #hardKey.
* \param ScanCode: The ScanCode of pressed key (similar to PS/2 Set 1).
* \param extend: Flag if key is not primary (left or right).
@ -291,7 +291,7 @@ class GHOST_SystemWin32 : public GHOST_System {
GHOST_TKey convertKey(short vKey, short ScanCode, short extend) const;
/**
* Catches raw WIN32 key codes from WM_INPUT in the wndproc.
* Catches raw WIN32 key codes from WM_INPUT in the `wndproc`.
* \param raw: RawInput structure with detailed info about the key event.
* \param r_key_down: Set true when the key is pressed, otherwise false.
* \return The GHOST key (GHOST_kKeyUnknown if no match).
@ -319,8 +319,8 @@ class GHOST_SystemWin32 : public GHOST_System {
* Creates tablet events from pointer events.
* \param type: The type of pointer event.
* \param window: The window receiving the event (the active window).
* \param wParam: The wParam from the wndproc.
* \param lParam: The lParam from the wndproc.
* \param wParam: The wParam from the `wndproc`.
* \param lParam: The lParam from the `wndproc`.
* \param eventhandled: True if the method handled the event.
*/
static void processPointerEvent(
@ -337,8 +337,8 @@ class GHOST_SystemWin32 : public GHOST_System {
/**
* Handles a mouse wheel event.
* \param window: The window receiving the event (the active window).
* \param wParam: The wParam from the wndproc.
* \param lParam: The lParam from the wndproc.
* \param wParam: The wParam from the `wndproc`.
* \param lParam: The lParam from the `wndproc`.
*/
static void processWheelEvent(GHOST_WindowWin32 *window, WPARAM wParam, LPARAM lParam);

View File

@ -2015,8 +2015,8 @@ void GHOST_SystemX11::getClipboard_xcout(
return;
}
/* if it's not incr, and not format == 8, then there's
* nothing in the selection (that xclip understands, anyway) */
/* If it's not INCR, and not `format == 8`, then there's
* nothing in the selection (that `xclip` understands, anyway). */
if (pty_format != 8) {
*context = XCLIB_XCOUT_NONE;

View File

@ -8,9 +8,9 @@ else
exit 1
fi
BRANCH="master"
BRANCH="main"
# repo="git://git.blender.org/libmv.git"
# repo="https://projects.blender.org/blender/libmv.git"
repo="/home/sergey/Developer/libmv"
tmp=`mktemp -d`

View File

@ -40,11 +40,11 @@ inline float fast_acosf(float x)
/* clamp and crush denormals. */
const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
/* Based on http://www.pouet.net/topic.php?which=9132&page=2
* 85% accurate (ulp 0)
* 85% accurate (ULP 0)
* Examined 2130706434 values of acos:
* 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
* 15.2000597 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // without "denormal crush"
* Examined 2130706434 values of acos:
* 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
* 15.2007108 avg ULP diff, 4492 max ULP, 4.51803e-05 max error // with "denormal crush"
*/
const float a = sqrtf(1.0f - m) *
(1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));

View File

@ -89,7 +89,7 @@ class EvalOutputAPI::EvalOutput {
// The following interfaces are dependent on the actual evaluator type (CPU, OpenGL, etc.) which
// have slightly different APIs to access patch arrays, as well as different types for their
// data structure. They need to be overridden in the specific instances of the EvalOutput derived
// classes if needed, while the interfaces above are overriden through VolatileEvalOutput.
// classes if needed, while the interfaces above are overridden through VolatileEvalOutput.
virtual void fillPatchArraysBuffer(OpenSubdiv_Buffer * /*patch_arrays_buffer*/)
{

View File

@ -1,41 +1,18 @@
This folder contains several scripts to smoothen the Blender LTS releases.
This folder contains a script to generate release notes and download URLs
for Blender LTS releases.
create_download_urls.py
=======================
Ensure required Python modules are installed before running:
This python script is used to generate the download urls which we can
copy-paste directly into the CMS of www.blender.org.
pip3 install -r ./requirements.txt
Usage: create_download_urls.py --version 2.83.7
Then run for example:
Arguments:
--version VERSION Version string in the form of {major}.{minor}.{build}
(eg 2.83.7)
./create_release_notes.py --version 3.3.2 --format=html
The resulting HTML will be printed to the console.
Available arguments:
create_release_notes.py
=======================
This python script is used to generate the release notes which we can
copy-paste directly into the CMS of www.blender.org and stores.
Usage: ./create_release_notes.py --task=T77348 --version=2.83.7
Arguments:
--version VERSION Version string in the form of {major}.{minor}.{build}
(e.g. 2.83.7)
--task TASK Phabricator ticket that is contains the release notes
information (e.g. T77348)
--format FORMAT Format the result in `text`, `steam`, `wiki` or `html`
Requirements
============
* Python 3.8 or later
* Python phabricator client version 0.7.0
https://pypi.org/project/phabricator/
For convenience the python modules can be installed using pip
pip3 install -r ./requirements.txt
--version VERSION Version string in the form of {major}.{minor}.{build}
(e.g. 3.3.2)
--issue ISSUE Gitea issue that contains the release notes
information (e.g. #77348)
--format FORMAT Format the result in `text`, `steam`, `wiki` or `html`

View File

@ -1,169 +1,46 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-or-later
#!/usr/bin/env python3
import argparse
import phabricator
import lts_issue
import lts_download
DESCRIPTION = ("This python script is used to generate the release notes "
"which we can copy-paste directly into the CMS of "
DESCRIPTION = ("This python script is used to generate the release notes and "
"download URLs which we can copy-paste directly into the CMS of "
"www.blender.org and stores.")
USAGE = "./create_release_notes.py --task=T77348 --version=2.83.7"
# Parse arguments
parser = argparse.ArgumentParser(description=DESCRIPTION)
parser.add_argument(
"--version",
required=True,
help="Version string in the form of {major}.{minor}.{patch} (e.g. 3.3.2)")
parser.add_argument(
"--issue",
help="Task that is contains the release notes information (e.g. #77348)")
parser.add_argument(
"--format",
help="Format the result in `text`, `steam`, `wiki` or `html`",
default="text")
args = parser.parse_args()
class ReleaseLogLine:
"""
Class containing the information of a single line of the release log
# Determine issue number
version = args.version
issue = args.issue
if not issue:
if version.startswith("2.83."):
issue = "#77348"
elif version.startswith("2.93."):
issue = "#88449"
elif version.startswith("3.3."):
issue = "#100749"
else:
raise ValueError("Specify --issue or update script to include issue number for this version")
Instance attributes:
# Print
if args.format == "html":
lts_download.print_urls(version=version)
print("")
* line: (str) the original line used to create this log line
* task_id: (int or None) the extracted task id associated with this log
line. Can be None if the log line isn't associated with a task.
* commit_id: (str or None) the extracted commit id associated with this log
line. Only filled when no `task_id` could be found.
* ref: (str) `task_id` or `commit_id` of this line, including `T` for tasks
or `D` for diffs.
* title: (str) title of this log line. When constructed this attribute is
an empty string. The called needs to retrieve the title from the
backend.
* url: (str) url of the ticket task or commit.
"""
def __init__(self, line: str):
self.line = line
items = line.split("|")
self.task_id = None
self.commit_id = None
try:
task_id = int(items[1].strip()[1:])
self.task_id = task_id
self.ref = f"T{self.task_id}"
except ValueError:
# no task
commit_string = items[3].strip()
commits = commit_string.split(",")
commit_id = commits[0]
commit_id = commit_id.replace("{", "").replace("}", "")
if not commit_id.startswith("rB"):
commit_id = f"rB{commit_id}"
self.commit_id = commit_id
self.ref = f"{self.commit_id}"
self.title = ""
self.url = f"https://developer.blender.org/{self.ref}"
def __format_as_html(self) -> str:
return f" <li>{self.title} [<a href=\"{self.url}\">{self.ref}</a>]</li>"
def __format_as_text(self) -> str:
return f"* {self.title} [{self.ref}]"
def __format_as_steam(self) -> str:
return f"* {self.title} ([url={self.url}]{self.ref}[/url])"
def __format_as_wiki(self) -> str:
if self.task_id:
return f"* {self.title} [{{{{BugReport|{self.task_id}}}}}]"
else:
return f"* {self.title} [{{{{GitCommit|{self.commit_id[2:]}}}}}]"
def format(self, format: str) -> str:
"""
Format this line
:attr format: the desired format. Possible values are 'text', 'steam' or 'html'
:type string:
"""
if format == 'html':
return self.__format_as_html()
elif format == 'steam':
return self.__format_as_steam()
elif format == 'wiki':
return self.__format_as_wiki()
else:
return self.__format_as_text()
def format_title(title: str) -> str:
title = title.strip()
if not title.endswith("."):
title = title + "."
return title
def extract_release_notes(version: str, task_id: int):
"""
Extract all release notes logs
# Process
1. Retrieval of description of the given `task_id`.
2. Find rows for the given `version` and convert to `ReleaseLogLine`.
3. based on the associated task or commit retrieves the title of the log
line.
"""
phab = phabricator.Phabricator()
phab.update_interfaces()
task = phab.maniphest.info(task_id=task_id)
description = task["description"]
lines = description.split("\n")
start_index = lines.index(f"## Blender {version} ##")
lines = lines[start_index + 1:]
for line in lines:
if not line.strip():
continue
if line.startswith("| **Report**"):
continue
if line.startswith("## Blender"):
break
log_line = ReleaseLogLine(line)
if log_line.task_id:
issue_task = phab.maniphest.info(task_id=log_line.task_id)
log_line.title = format_title(issue_task.title)
yield log_line
elif log_line.commit_id:
commits = phab.diffusion.commit.search(constraints={"identifiers": [log_line.commit_id]})
commit = commits.data[0]
commit_message = commit['fields']['message']
commit_title = commit_message.split("\n")[0]
log_line.title = format_title(commit_title)
yield log_line
def print_release_notes(version: str, format: str, task_id: int):
"""
Generate and print the release notes to the console.
"""
if format == 'html':
print("<ul>")
if format == 'steam':
print("[ul]")
for log_item in extract_release_notes(version=version, task_id=task_id):
print(log_item.format(format=format))
if format == 'html':
print("</ul>")
if format == 'steam':
print("[/ul]")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
parser.add_argument(
"--version",
required=True,
help="Version string in the form of {major}.{minor}.{build} (e.g. 2.83.7)")
parser.add_argument(
"--task",
required=True,
help="Phabricator ticket that is contains the release notes information (e.g. T77348)")
parser.add_argument(
"--format",
help="Format the result in `text`, `steam`, `wiki` or `html`",
default="text")
args = parser.parse_args()
print_release_notes(version=args.version, format=args.format, task_id=int(args.task[1:]))
lts_issue.print_notes(version=version, format=args.format, issue=issue)

View File

@ -1,14 +1,9 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-or-later
import argparse
import datetime
DESCRIPTION = ("This python script is used to generate the download urls "
"which we can copy-paste directly into the CMS of "
"www.blender.org")
USAGE = "create_download_urls --version=2.83.7"
# Used date format: "September 30, 2020"
DATE_FORMAT = "%B %d, %Y"
@ -62,19 +57,8 @@ def generate_html(version: Version) -> str:
return "\n".join(lines)
def print_download_urls(version: Version):
def print_urls(version: str):
"""
Generate the download urls and print them to the console.
"""
print(generate_html(version))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
parser.add_argument("--version",
required=True,
help=("Version string in the form of {major}.{minor}."
"{build} (eg 2.83.7)"))
args = parser.parse_args()
print_download_urls(version=Version(args.version))
print(generate_html(Version(version)))
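Stripped of its CLI, the module is now import-friendly; create_release_notes.py above calls it exactly like this short sketch:

import lts_download

# Prints the HTML snippet of download URLs for the given version string.
lts_download.print_urls(version="3.3.2")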

169
release/lts/lts_issue.py Normal file
View File

@ -0,0 +1,169 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0-or-later
import requests
class ReleaseLogLine:
"""
Class containing the information of a single line of the release log
Instance attributes:
* line: (str) the original line used to create this log line
* issue_id: (int or None) the extracted issue id associated with this log
line. Can be None if the log line isn't associated with an issue.
* commit_id: (str or None) the extracted commit id associated with this log
line. Only filled when no `issue_id` could be found.
* ref: (str) `issue_id` or `commit_id` of this line, with a `#` prefix for
issues.
* title: (str) title of this log line. When constructed this attribute is
an empty string. The caller needs to retrieve the title from the
backend.
* url: (str) URL of the issue or commit.
"""
def __init__(self, line: str):
self.line = line
items = line.split("|")
self.issue_id = None
self.issue_repo = None
self.commit_id = None
self.commit_repo = None
base_url = "https://projects.blender.org"
try:
issue_tokens = items[1].strip().split("#")
if len(issue_tokens[0]) > 0:
self.issue_repo = issue_tokens[0]
self.issue_id = issue_tokens[1]
else:
self.issue_repo = "blender/blender"
self.issue_id = issue_tokens[1]
self.ref = f"#{self.issue_id}"
self.url = f"{base_url}/{self.issue_repo}/issues/{self.issue_id}"
except IndexError:
# no issue
commit_string = items[3].strip()
commit_string = commit_string.split(",")[0]
commit_string = commit_string.split("]")[0]
commit_string = commit_string.replace("[", "")
commit_tokens = commit_string.split("@")
if len(commit_tokens) > 1:
self.commit_repo = commit_tokens[0]
self.commit_id = commit_tokens[1]
else:
self.commit_repo = "blender/blender"
self.commit_id = commit_tokens[0]
self.ref = f"{self.commit_id}"
self.url = f"{base_url}/{self.commit_repo}/commit/{self.commit_id}"
self.title = ""
def __format_as_html(self) -> str:
return f" <li>{self.title} [<a href=\"{self.url}\">{self.ref}</a>]</li>"
def __format_as_text(self) -> str:
return f"* {self.title} [{self.ref}]"
def __format_as_steam(self) -> str:
return f"* {self.title} ([url={self.url}]{self.ref}[/url])"
def __format_as_wiki(self) -> str:
if self.issue_id:
return f"* {self.title} [{{{{BugReport|{self.issue_id}}}}}]"
else:
return f"* {self.title} [{{{{GitCommit|{self.commit_id[2:]}}}}}]"
def format(self, format: str) -> str:
"""
Format this line
:attr format: the desired format. Possible values are 'text', 'steam' or 'html'
:type string:
"""
if format == 'html':
return self.__format_as_html()
elif format == 'steam':
return self.__format_as_steam()
elif format == 'wiki':
return self.__format_as_wiki()
else:
return self.__format_as_text()
def format_title(title: str) -> str:
title = title.strip()
if not title.endswith("."):
title = title + "."
return title
def extract_release_notes(version: str, issue: str):
"""
Extract all release notes logs
# Process
1. Retrieve the description of the given `issue`.
2. Find rows for the given `version` and convert them to `ReleaseLogLine`.
3. Based on the associated issue or commit, retrieve the title of the log
line.
"""
base_url = "https://projects.blender.org/api/v1/repos"
issues_url = base_url + "/blender/blender/issues/"
headers = {'accept': 'application/json'}
response = requests.get(issues_url + issue[1:], headers=headers)
description = response.json()["body"]
lines = description.split("\n")
start_index = lines.index(f"## Blender {version}")
lines = lines[start_index + 1:]
for line in lines:
if not line.strip():
continue
if line.startswith("| **Report**"):
continue
if line.startswith("## Blender"):
break
if line.find("| -- |") != -1:
continue
log_line = ReleaseLogLine(line)
if log_line.issue_id:
issue_url = f"{base_url}/{log_line.issue_repo}/issues/{log_line.issue_id}"
response = requests.get(issue_url, headers=headers)
if response.status_code != 200:
raise ValueError("Issue not found: " + str(log_line.issue_id))
log_line.title = format_title(response.json()["title"])
yield log_line
elif log_line.commit_id:
commit_url = f"{base_url}/{log_line.commit_repo}/git/commits/{log_line.commit_id}"
response = requests.get(commit_url, headers=headers)
if response.status_code != 200:
raise ValueError("Commit not found: " + log_line.commit_id)
commit_message = response.json()['commit']['message']
commit_title = commit_message.split("\n")[0]
log_line.title = format_title(commit_title)
yield log_line
def print_notes(version: str, format: str, issue: str):
"""
Generate and print the release notes to the console.
"""
if format == 'html':
print("<ul>")
if format == 'steam':
print("[ul]")
for log_item in extract_release_notes(version=version, issue=issue):
print(log_item.format(format=format))
if format == 'html':
print("</ul>")
if format == 'steam':
print("[/ul]")

View File

@ -1 +1 @@
phabricator==0.7.0
requests

@ -1 +1 @@
Subproject commit 534bf3b76c3b5f3bcd21641f1d53c1062bedcdbe
Subproject commit b3f0ffc587d197b37eac9a1566d1d24b7bee7d9a

View File

@ -1299,6 +1299,8 @@ def km_uv_editor(params):
{"properties": [("data_path", 'tool_settings.snap_uv_element')]}),
("wm.context_toggle", {"type": 'ACCENT_GRAVE', "value": 'PRESS', "ctrl": True},
{"properties": [("data_path", 'space_data.show_gizmo')]}),
("wm.context_toggle", {"type": 'Z', "value": 'PRESS', "alt": True, "shift": True},
{"properties": [("data_path", "space_data.overlay.show_overlays")]}),
*_template_items_context_menu("IMAGE_MT_uvs_context_menu", params.context_menu_event),
])
@ -1968,6 +1970,8 @@ def km_image(params):
("image.clear_render_border", {"type": 'B', "value": 'PRESS', "ctrl": True, "alt": True}, None),
("wm.context_toggle", {"type": 'ACCENT_GRAVE', "value": 'PRESS', "ctrl": True},
{"properties": [("data_path", 'space_data.show_gizmo')]}),
("wm.context_toggle", {"type": 'Z', "value": 'PRESS', "alt": True, "shift": True},
{"properties": [("data_path", "space_data.overlay.show_overlays")]}),
*_template_items_context_menu("IMAGE_MT_mask_context_menu", params.context_menu_event),
])
@ -2914,6 +2918,8 @@ def km_sequencer(params):
{"properties": [("side", 'RIGHT')]}),
("wm.context_toggle", {"type": 'TAB', "value": 'PRESS', "shift": True},
{"properties": [("data_path", 'tool_settings.use_snap_sequencer')]}),
("wm.context_toggle", {"type": 'Z', "value": 'PRESS', "alt": True, "shift": True},
{"properties": [("data_path", "space_data.show_overlays")]}),
*_template_items_context_menu("SEQUENCER_MT_context_menu", params.context_menu_event),
])
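All three additions bind Alt-Shift-Z to the same operator; the equivalent Python, runnable from a console whose context is the matching editor, is:

import bpy

# UV and Image editors keep the flag on the overlay struct:
bpy.ops.wm.context_toggle(data_path="space_data.overlay.show_overlays")

# The sequencer stores it directly on the space:
bpy.ops.wm.context_toggle(data_path="space_data.show_overlays")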
@ -6318,7 +6324,28 @@ def km_curve_pen_modal_map(_params):
return keymap
def km_node_link_modal_map(_params):
items = []
keymap = (
"Node Link Modal Map",
{"space_type": 'EMPTY', "region_type": 'WINDOW', "modal": True},
{"items": items},
)
items.extend([
("BEGIN", {"type": 'LEFTMOUSE', "value": 'PRESS', "any": True}, None),
("CONFIRM", {"type": 'LEFTMOUSE', "value": 'RELEASE', "any": True}, None),
("CANCEL", {"type": 'RIGHTMOUSE', "value": 'PRESS', "any": True}, None),
("CANCEL", {"type": 'ESC', "value": 'PRESS', "any": True}, None),
("SWAP", {"type": 'LEFT_ALT', "value": 'ANY', "any": True}, None),
("SWAP", {"type": 'RIGHT_ALT', "value": 'ANY', "any": True}, None),
])
return keymap
# Fallback for gizmos that don't have a custom key-map.
def km_generic_gizmo(_params):
keymap = (
"Generic Gizmo",
@ -8087,6 +8114,7 @@ def generate_keymaps(params=None):
km_paint_stroke_modal(params),
km_sculpt_expand_modal(params),
km_curve_pen_modal_map(params),
km_node_link_modal_map(params),
# Gizmos.
km_generic_gizmo(params),

View File

@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-or-later
import bpy
from bpy.types import Operator, UILayout, Context
from bpy.props import EnumProperty, StringProperty
"""
This module (in particular the draw_ui_list function) lets you draw the commonly
@ -15,54 +12,67 @@ You can get an example of how to use this via the Blender Text Editor->
Templates->Ui List Generic.
"""
import bpy
from bpy.types import Operator
from bpy.props import (
EnumProperty,
StringProperty,
)
__all__ = (
"draw_ui_list",
)
def draw_ui_list(
layout: UILayout,
context: Context,
layout,
context,
class_name="UI_UL_list",
*,
unique_id="",
list_path: str,
active_idx_path: str,
list_path,
active_index_path,
insertion_operators=True,
move_operators=True,
menu_class_name="",
**kwargs) -> UILayout:
**kwargs,
):
"""
Draw a UIList with Add/Remove/Move buttons and a menu.
:param layout:
UILayout to draw the list in.
:param context:
Blender context to get the list data from.
:param class_name:
Name of the UIList class to draw. The default is the
UIList class that ships with Blender.
:param unique_id:
Optional identifier, in case wanting to draw multiple unique copies of a list.
:arg layout: UILayout to draw the list in.
:type layout: :class:`UILayout`
:arg context: Blender context to get the list data from.
:type context: :class:`Context`
:arg class_name: Name of the UIList class to draw. The default is the UIList class that ships with Blender.
:type class_name: str
:arg unique_id: Optional identifier, in case wanting to draw multiple unique copies of a list.
:type unique_id: str
:arg list_path: Data path of the list relative to context, e.g. "object.vertex_groups".
:type list_path: str
:arg active_index_path: Data path of the list active index integer relative to context,
eg. "object.vertex_groups.active_index".
:type active_index_path: str
:arg insertion_operators: Whether to draw Add/Remove buttons.
:type insertion_operators: bool
:arg move_operators: Whether to draw Move Up/Down buttons.
:type move_operators: bool
:arg menu_class_name: Identifier of a Menu that should be drawn as a drop-down.
:type menu_class_name: str
:param list_path:
Data path of the list relative to context, eg. "object.vertex_groups".
:param active_idx_path:
Data path of the list active index integer relative to context,
eg. "object.vertex_groups.active_index".
:returns: The right side column.
:rtype: :class:`UILayout`.
:param insertion_operators:
Whether to draw Add/Remove buttons.
:param move_operators:
Whether to draw Move Up/Down buttons.
:param menu_class_name:
Name of a Menu that should be drawn as a drop-down.
Additional keyword arguments are passed to template_list().
Additional keyword arguments are passed to :class:`UILayout.template_list`.
"""
row = layout.row()
list_owner_path, list_prop_name = list_path.rsplit('.', 1)
list_owner = _get_context_attr(context, list_owner_path)
idx_owner_path, idx_prop_name = active_idx_path.rsplit('.', 1)
idx_owner = _get_context_attr(context, idx_owner_path)
index_owner_path, index_prop_name = active_index_path.rsplit('.', 1)
index_owner = _get_context_attr(context, index_owner_path)
list_to_draw = _get_context_attr(context, list_path)
@ -70,8 +80,8 @@ def draw_ui_list(
class_name,
unique_id,
list_owner, list_prop_name,
idx_owner, idx_prop_name,
rows=4 if len(list_to_draw) > 0 else 1,
index_owner, index_prop_name,
rows=4 if list_to_draw else 1,
**kwargs
)
@ -81,7 +91,7 @@ def draw_ui_list(
_draw_add_remove_buttons(
layout=col,
list_path=list_path,
active_idx_path=active_idx_path,
active_index_path=active_index_path,
list_length=len(list_to_draw)
)
layout.separator()
@ -90,11 +100,11 @@ def draw_ui_list(
col.menu(menu_class_name, icon='DOWNARROW_HLT', text="")
col.separator()
if move_operators and len(list_to_draw) > 0:
if move_operators and list_to_draw:
_draw_move_buttons(
layout=col,
list_path=list_path,
active_idx_path=active_idx_path,
active_index_path=active_index_path,
list_length=len(list_to_draw)
)
@ -104,50 +114,50 @@ def draw_ui_list(
def _draw_add_remove_buttons(
*,
layout: UILayout,
list_path: str,
active_idx_path: str,
list_length: int
) -> None:
layout,
list_path,
active_index_path,
list_length,
):
"""Draw the +/- buttons to add and remove list entries."""
add_op = layout.operator(UILIST_OT_entry_add.bl_idname, text="", icon='ADD')
add_op.list_path = list_path
add_op.active_idx_path = active_idx_path
add_op.active_index_path = active_index_path
row = layout.row()
row.enabled = list_length > 0
remove_op = row.operator(UILIST_OT_entry_remove.bl_idname, text="", icon='REMOVE')
remove_op.list_path = list_path
remove_op.active_idx_path = active_idx_path
remove_op.active_index_path = active_index_path
def _draw_move_buttons(
*,
layout: UILayout,
list_path: str,
active_idx_path: str,
list_length: int
) -> None:
layout,
list_path,
active_index_path,
list_length,
):
"""Draw the up/down arrows to move elements in the list."""
col = layout.column()
col.enabled = list_length > 1
move_up_op = layout.operator(UILIST_OT_entry_move.bl_idname, text="", icon='TRIA_UP')
move_up_op.direction = 'UP'
move_up_op.list_path = list_path
move_up_op.active_idx_path = active_idx_path
move_up_op.active_index_path = active_index_path
move_down_op = layout.operator(UILIST_OT_entry_move.bl_idname, text="", icon='TRIA_DOWN')
move_down_op.direction = 'DOWN'
move_down_op.list_path = list_path
move_down_op.active_idx_path = active_idx_path
move_down_op.active_index_path = active_index_path
def _get_context_attr(context: Context, data_path: str) -> object:
def _get_context_attr(context, data_path):
"""Return the value of a context member based on its data path."""
return context.path_resolve(data_path)
def _set_context_attr(context: Context, data_path: str, value: object) -> None:
def _set_context_attr(context, data_path, value) -> None:
"""Set the value of a context member based on its data path."""
owner_path, attr_name = data_path.rsplit('.', 1)
owner = context.path_resolve(owner_path)
@ -160,19 +170,18 @@ class GenericUIListOperator:
bl_options = {'REGISTER', 'UNDO', 'INTERNAL'}
list_path: StringProperty()
active_idx_path: StringProperty()
active_index_path: StringProperty()
def get_list(self, context) -> str:
return _get_context_attr(context, self.list_path)
def get_active_index(self, context) -> str:
return _get_context_attr(context, self.active_idx_path)
return _get_context_attr(context, self.active_index_path)
def set_active_index(self, context, index):
_set_context_attr(context, self.active_idx_path, index)
_set_context_attr(context, self.active_index_path, index)
# noinspection PyPep8Naming
class UILIST_OT_entry_remove(GenericUIListOperator, Operator):
"""Remove the selected entry from the list"""
@ -190,7 +199,6 @@ class UILIST_OT_entry_remove(GenericUIListOperator, Operator):
return {'FINISHED'}
# noinspection PyPep8Naming
class UILIST_OT_entry_add(GenericUIListOperator, Operator):
"""Add an entry to the list after the current active item"""
@ -210,7 +218,6 @@ class UILIST_OT_entry_add(GenericUIListOperator, Operator):
return {'FINISHED'}
# noinspection PyPep8Naming
class UILIST_OT_entry_move(GenericUIListOperator, Operator):
"""Move an entry in the list up or down"""
@ -219,8 +226,8 @@ class UILIST_OT_entry_move(GenericUIListOperator, Operator):
direction: EnumProperty(
name="Direction",
items=[('UP', 'UP', 'UP'),
('DOWN', 'DOWN', 'DOWN')],
items=(('UP', 'UP', 'UP'),
('DOWN', 'DOWN', 'DOWN')),
default='UP'
)
@ -241,9 +248,7 @@ class UILIST_OT_entry_move(GenericUIListOperator, Operator):
return {'FINISHED'}
# =============================================
# Registration
# Registration.
classes = (
UILIST_OT_entry_remove,
UILIST_OT_entry_add,

@ -46,6 +46,7 @@ class NODE_MT_geometry_node_GEO_CURVE(Menu):
def draw(self, _context):
layout = self.layout
layout.menu("NODE_MT_geometry_node_GEO_CURVE_READ")
layout.menu("NODE_MT_geometry_node_GEO_CURVE_SAMPLE")
layout.menu("NODE_MT_geometry_node_GEO_CURVE_WRITE")
layout.separator()
layout.menu("NODE_MT_geometry_node_GEO_CURVE_OPERATIONS")
@ -73,6 +74,16 @@ class NODE_MT_geometry_node_GEO_CURVE_READ(Menu):
node_add_menu.draw_assets_for_catalog(layout, self.bl_label)
class NODE_MT_geometry_node_GEO_CURVE_SAMPLE(Menu):
bl_idname = "NODE_MT_geometry_node_GEO_CURVE_SAMPLE"
bl_label = "Sample"
def draw(self, _context):
layout = self.layout
node_add_menu.add_node_type(layout, "GeometryNodeSampleCurve")
node_add_menu.draw_assets_for_catalog(layout, self.bl_label)
class NODE_MT_geometry_node_GEO_CURVE_WRITE(Menu):
bl_idname = "NODE_MT_geometry_node_GEO_CURVE_WRITE"
bl_label = "Write"
@ -104,7 +115,6 @@ class NODE_MT_geometry_node_GEO_CURVE_OPERATIONS(Menu):
node_add_menu.add_node_type(layout, "GeometryNodeInterpolateCurves")
node_add_menu.add_node_type(layout, "GeometryNodeResampleCurve")
node_add_menu.add_node_type(layout, "GeometryNodeReverseCurve")
node_add_menu.add_node_type(layout, "GeometryNodeSampleCurve")
node_add_menu.add_node_type(layout, "GeometryNodeSubdivideCurve")
node_add_menu.add_node_type(layout, "GeometryNodeTrimCurve")
node_add_menu.draw_assets_for_catalog(layout, self.bl_label)
@ -146,10 +156,10 @@ class NODE_MT_geometry_node_GEO_GEOMETRY(Menu):
def draw(self, _context):
layout = self.layout
layout.menu("NODE_MT_geometry_node_GEO_GEOMETRY_READ")
layout.menu("NODE_MT_geometry_node_GEO_GEOMETRY_SAMPLE")
layout.menu("NODE_MT_geometry_node_GEO_GEOMETRY_WRITE")
layout.separator()
layout.menu("NODE_MT_geometry_node_GEO_GEOMETRY_OPERATIONS")
layout.menu("NODE_MT_geometry_node_GEO_GEOMETRY_SAMPLE")
layout.separator()
node_add_menu.add_node_type(layout, "GeometryNodeJoinGeometry")
node_add_menu.add_node_type(layout, "GeometryNodeGeometryToInstance")
@ -309,6 +319,7 @@ class NODE_MT_geometry_node_GEO_MESH(Menu):
def draw(self, _context):
layout = self.layout
layout.menu("NODE_MT_geometry_node_GEO_MESH_READ")
layout.menu("NODE_MT_geometry_node_GEO_MESH_SAMPLE")
layout.menu("NODE_MT_geometry_node_GEO_MESH_WRITE")
layout.separator()
layout.menu("NODE_MT_geometry_node_GEO_MESH_OPERATIONS")
@ -338,6 +349,17 @@ class NODE_MT_geometry_node_GEO_MESH_READ(Menu):
node_add_menu.draw_assets_for_catalog(layout, self.bl_label)
class NODE_MT_geometry_node_GEO_MESH_SAMPLE(Menu):
bl_idname = "NODE_MT_geometry_node_GEO_MESH_SAMPLE"
bl_label = "Sample"
def draw(self, _context):
layout = self.layout
node_add_menu.add_node_type(layout, "GeometryNodeSampleNearestSurface")
node_add_menu.add_node_type(layout, "GeometryNodeSampleUVSurface")
node_add_menu.draw_assets_for_catalog(layout, self.bl_label)
class NODE_MT_geometry_node_GEO_MESH_WRITE(Menu):
bl_idname = "NODE_MT_geometry_node_GEO_MESH_WRITE"
bl_label = "Write"
@ -363,8 +385,6 @@ class NODE_MT_geometry_node_GEO_MESH_OPERATIONS(Menu):
node_add_menu.add_node_type(layout, "GeometryNodeMeshToCurve")
node_add_menu.add_node_type(layout, "GeometryNodeMeshToPoints")
node_add_menu.add_node_type(layout, "GeometryNodeMeshToVolume")
node_add_menu.add_node_type(layout, "GeometryNodeSampleNearestSurface")
node_add_menu.add_node_type(layout, "GeometryNodeSampleUVSurface")
node_add_menu.add_node_type(layout, "GeometryNodeScaleElements")
node_add_menu.add_node_type(layout, "GeometryNodeSplitEdges")
node_add_menu.add_node_type(layout, "GeometryNodeSubdivideMesh")
@ -629,6 +649,7 @@ classes = (
NODE_MT_category_GEO_OUTPUT,
NODE_MT_geometry_node_GEO_CURVE,
NODE_MT_geometry_node_GEO_CURVE_READ,
NODE_MT_geometry_node_GEO_CURVE_SAMPLE,
NODE_MT_geometry_node_GEO_CURVE_WRITE,
NODE_MT_geometry_node_GEO_CURVE_OPERATIONS,
NODE_MT_geometry_node_GEO_PRIMITIVES_CURVE,
@ -641,6 +662,7 @@ classes = (
NODE_MT_geometry_node_GEO_INSTANCE,
NODE_MT_geometry_node_GEO_MESH,
NODE_MT_geometry_node_GEO_MESH_READ,
NODE_MT_geometry_node_GEO_MESH_SAMPLE,
NODE_MT_geometry_node_GEO_MESH_WRITE,
NODE_MT_geometry_node_GEO_MESH_OPERATIONS,
NODE_MT_category_GEO_UV,

@ -708,6 +708,9 @@ class DOPESHEET_MT_channel_context_menu(Menu):
operator = "action.extrapolation_type"
layout.operator_menu_enum(operator, "type", text="Extrapolation Mode")
if is_graph_editor:
layout.operator_menu_enum("graph.fmodifier_add", "type", text="Add F-Curve Modifier").only_active = False
layout.separator()
layout.operator("anim.channels_expand")
layout.operator("anim.channels_collapse")

@ -222,6 +222,7 @@ class GRAPH_MT_channel(Menu):
layout.separator()
layout.operator("anim.channels_editable_toggle")
layout.operator_menu_enum("graph.extrapolation_type", "type", text="Extrapolation Mode")
layout.operator_menu_enum("graph.fmodifier_add", "type", text="Add F-Curve Modifier").only_active = False
layout.separator()
layout.operator("graph.hide", text="Hide Selected Curves").unselected = False

@ -2267,7 +2267,7 @@ class ExperimentalPanel:
bl_region_type = 'WINDOW'
bl_context = "experimental"
url_prefix = "https://developer.blender.org/"
url_prefix = "https://projects.blender.org/"
@classmethod
def poll(cls, _context):
@ -2308,8 +2308,8 @@ class USERPREF_PT_experimental_virtual_reality(ExperimentalPanel, Panel):
def draw(self, context):
self._draw_items(
context, (
({"property": "use_virtual_reality_scene_inspection"}, "T71347"),
({"property": "use_virtual_reality_immersive_drawing"}, "T71348"),
({"property": "use_virtual_reality_scene_inspection"}, ("blender/blender/issues/71347", "#71347")),
({"property": "use_virtual_reality_immersive_drawing"}, ("blender/blender/issues/71348", "#71348")),
),
)
"""
@ -2319,13 +2319,18 @@ class USERPREF_PT_experimental_new_features(ExperimentalPanel, Panel):
bl_label = "New Features"
def draw(self, context):
self._draw_items(
context, (
({"property": "use_sculpt_tools_tilt"}, "T82877"),
({"property": "use_extended_asset_browser"}, ("project/view/130/", "Project Page")),
({"property": "use_override_templates"}, ("T73318", "Milestone 4")),
),
)
self._draw_items(context,
(({"property": "use_sculpt_tools_tilt"},
("blender/blender/issues/82877",
"#82877")),
({"property": "use_extended_asset_browser"},
("blender/blender/projects/10",
"Pipeline, Assets & IO Project Page")),
({"property": "use_override_templates"},
("blender/blender/issues/73318",
"Milestone 4")),
),
)
class USERPREF_PT_experimental_prototypes(ExperimentalPanel, Panel):
@ -2334,12 +2339,12 @@ class USERPREF_PT_experimental_prototypes(ExperimentalPanel, Panel):
def draw(self, context):
self._draw_items(
context, (
({"property": "use_new_curves_tools"}, "T68981"),
({"property": "use_new_point_cloud_type"}, "T75717"),
({"property": "use_sculpt_texture_paint"}, "T96225"),
({"property": "use_full_frame_compositor"}, "T88150"),
({"property": "enable_eevee_next"}, "T93220"),
({"property": "enable_workbench_next"}, "T101619"),
({"property": "use_new_curves_tools"}, ("blender/blender/issues/68981", "#68981")),
({"property": "use_new_point_cloud_type"}, ("blender/blender/issues/75717", "#75717")),
({"property": "use_sculpt_texture_paint"}, ("blender/blender/issues/96225", "#96225")),
({"property": "use_full_frame_compositor"}, ("blender/blender/issues/88150", "#88150")),
({"property": "enable_eevee_next"}, ("blender/blender/issues/93220", "#93220")),
({"property": "enable_workbench_next"}, ("blender/blender/issues/101619", "#101619")),
),
)
@ -2352,7 +2357,7 @@ class USERPREF_PT_experimental_tweaks(ExperimentalPanel, Panel):
def draw(self, context):
self._draw_items(
context, (
({"property": "use_select_nearest_on_first_click"}, "T96752"),
({"property": "use_select_nearest_on_first_click"}, ("blender/blender/issues/96752", "#96752")),
),
)
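Across these experimental panels, the second element of each item changed from a bare task ID like "T82877" to a (path, label) pair joined with the new url_prefix. The _draw_items implementation is not part of this diff, so the following is only an assumed sketch of how such pairs could be turned into buttons:

# Assumed sketch, not the actual _draw_items implementation.
class ExperimentalPanelSketch:
    url_prefix = "https://projects.blender.org/"

    def _draw_items(self, context, items):
        prefs = context.preferences
        layout = self.layout
        for props, reference in items:
            split = layout.split(factor=0.66)
            split.prop(prefs.experimental, props["property"])
            if reference is not None:
                # e.g. ("blender/blender/issues/82877", "#82877")
                path, label = reference
                url = self.url_prefix + path
                split.operator("wm.url_open", text=label, icon='URL').url = url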
@ -2371,8 +2376,8 @@ class USERPREF_PT_experimental_debugging(ExperimentalPanel, Panel):
def draw(self, context):
self._draw_items(
context, (
({"property": "use_undo_legacy"}, "T60695"),
({"property": "override_auto_resync"}, "T83811"),
({"property": "use_undo_legacy"}, ("blender/blender/issues/60695", "#60695")),
({"property": "override_auto_resync"}, ("blender/blender/issues/83811", "#83811")),
({"property": "use_cycles_debug"}, None),
({"property": "show_asset_debug_info"}, None),
({"property": "use_asset_indexing"}, None),

@ -19,7 +19,7 @@ class MyPanel(bpy.types.Panel):
layout,
context,
list_context_path="scene.my_list",
active_idx_context_path="scene.my_list_active_idx"
active_index_context_path="scene.my_list_active_index"
)
@ -34,13 +34,13 @@ class_register, class_unregister = bpy.utils.register_classes_factory(classes)
def register():
class_register()
bpy.types.Scene.my_list = bpy.props.CollectionProperty(type=MyPropGroup)
bpy.types.Scene.my_list_active_idx = bpy.props.IntProperty()
bpy.types.Scene.my_list_active_index = bpy.props.IntProperty()
def unregister():
class_unregister()
del bpy.types.Scene.my_list
del bpy.types.Scene.my_list_active_idx
del bpy.types.Scene.my_list_active_index
register()

@ -96,8 +96,8 @@ Chat <a href="https://blender.chat/channel/today">
<p class="p5">
<span class="s3">Development <a href="https://www.blender.org/get-involved/developers/">
<span class="s4">www.blender.org/get-involved/developers/</span></a><br>
GIT and Bug Tracker <a href="https://developer.blender.org/">
<span class="s4">developer.blender.org</span></a><br>
GIT and Bug Tracker <a href="https://projects.blender.org/">
<span class="s4">projects.blender.org</span></a><br>
Chat <a href="https://blender.chat/channel/blender-coders">
<span class="s4">#blender-coders</span></a> on blender.chat</span>
</p>

@ -3,7 +3,7 @@ echo Starting blender with GPU debugging options, log files will be created
echo in your temp folder, windows explorer will open after you close blender
echo to help you find them.
echo.
echo If you report a bug on https://developer.blender.org you can attach these files
echo If you report a bug on https://projects.blender.org you can attach these files
echo by dragging them into the text area of your bug report, please include both
echo blender_debug_output.txt and blender_system_info.txt in your report.
echo.

@ -3,7 +3,7 @@ echo Starting blender with GPU debugging and glitch workaround options, log file
echo will be created in your temp folder, windows explorer will open after you
echo close blender to help you find them.
echo.
echo If you report a bug on https://developer.blender.org you can attach these files
echo If you report a bug on https://projects.blender.org you can attach these files
echo by dragging them into the text area of your bug report, please include both
echo blender_debug_output.txt and blender_system_info.txt in your report.
echo.

@ -3,7 +3,7 @@ echo Starting blender with debug logging options, log files will be created
echo in your temp folder, windows explorer will open after you close blender
echo to help you find them.
echo.
echo If you report a bug on https://developer.blender.org you can attach these files
echo If you report a bug on https://projects.blender.org you can attach these files
echo by dragging them into the text area of your bug report, please include both
echo blender_debug_output.txt and blender_system_info.txt in your report.
echo.

@ -3,7 +3,7 @@ echo Starting blender with factory settings, log files will be created
echo in your temp folder, windows explorer will open after you close blender
echo to help you find them.
echo.
echo If you report a bug on https://developer.blender.org you can attach these files
echo If you report a bug on https://projects.blender.org you can attach these files
echo by dragging them into the text area of your bug report, please include both
echo blender_debug_output.txt and blender_system_info.txt in your report.
echo.

@ -6,8 +6,7 @@
#pragma once
#include "BLI_float3x3.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_math_matrix.hh"
#include "BLI_span.hh"
struct Depsgraph;
@ -38,7 +37,7 @@ struct GeometryDeformation {
return translation;
}
const float3x3 &deform_mat = this->deform_mats[position_i];
return deform_mat.inverted() * translation;
return math::transform_point(math::invert(deform_mat), translation);
}
};

@ -13,10 +13,9 @@
#include "BLI_bounds_types.hh"
#include "BLI_cache_mutex.hh"
#include "BLI_float3x3.hh"
#include "BLI_float4x4.hh"
#include "BLI_generic_virtual_array.hh"
#include "BLI_index_mask.hh"
#include "BLI_math_matrix_types.hh"
#include "BLI_math_vector_types.hh"
#include "BLI_offset_indices.hh"
#include "BLI_shared_cache.hh"
@ -69,21 +68,24 @@ class CurvesGeometryRuntime {
* Cache of offsets into the evaluated array for each curve, accounting for all previous
* evaluated points, Bezier curve vector segments, different resolutions per curve, etc.
*/
mutable Vector<int> evaluated_offsets_cache;
mutable Vector<int> all_bezier_evaluated_offsets;
mutable CacheMutex offsets_cache_mutex;
struct EvaluatedOffsets {
Vector<int> evaluated_offsets;
Vector<int> all_bezier_offsets;
};
mutable SharedCache<EvaluatedOffsets> evaluated_offsets_cache;
mutable Vector<curves::nurbs::BasisCache> nurbs_basis_cache;
mutable CacheMutex nurbs_basis_cache_mutex;
mutable SharedCache<Vector<curves::nurbs::BasisCache>> nurbs_basis_cache;
/** Cache of evaluated positions. */
mutable Vector<float3> evaluated_position_cache;
mutable CacheMutex position_cache_mutex;
/**
* The evaluated positions result, using a separate span in case all curves are poly curves,
* in which case a separate array of evaluated positions is unnecessary.
*/
mutable Span<float3> evaluated_positions_span;
struct EvaluatedPositions {
Vector<float3> vector;
/**
* The evaluated positions result, using a separate span in case all curves are poly curves,
* in which case a separate array of evaluated positions is unnecessary.
*/
Span<float3> span;
};
mutable SharedCache<EvaluatedPositions> evaluated_position_cache;
/**
* A cache of bounds shared between data-blocks with unchanged positions and radii.
@ -97,16 +99,13 @@ class CurvesGeometryRuntime {
* cyclic, it needs one more length value to correspond to the last segment, so in order to
* make slicing this array for a curve fast, an extra float is stored for every curve.
*/
mutable Vector<float> evaluated_length_cache;
mutable CacheMutex length_cache_mutex;
mutable SharedCache<Vector<float>> evaluated_length_cache;
/** Direction of the curve at each evaluated point. */
mutable Vector<float3> evaluated_tangent_cache;
mutable CacheMutex tangent_cache_mutex;
mutable SharedCache<Vector<float3>> evaluated_tangent_cache;
/** Normal direction vectors for each evaluated point. */
mutable Vector<float3> evaluated_normal_cache;
mutable CacheMutex normal_cache_mutex;
mutable SharedCache<Vector<float3>> evaluated_normal_cache;
};
/**
@ -866,7 +865,8 @@ inline Span<int> CurvesGeometry::bezier_evaluated_offsets_for_curve(const int cu
const OffsetIndices points_by_curve = this->points_by_curve();
const IndexRange points = points_by_curve[curve_index];
const IndexRange range = curves::per_curve_point_offsets_range(points, curve_index);
return this->runtime->all_bezier_evaluated_offsets.as_span().slice(range);
const Span<int> offsets = this->runtime->evaluated_offsets_cache.data().all_bezier_offsets;
return offsets.slice(range);
}
inline IndexRange CurvesGeometry::lengths_range_for_curve(const int curve_index,
@ -881,9 +881,8 @@ inline IndexRange CurvesGeometry::lengths_range_for_curve(const int curve_index,
inline Span<float> CurvesGeometry::evaluated_lengths_for_curve(const int curve_index,
const bool cyclic) const
{
BLI_assert(this->runtime->length_cache_mutex.is_cached());
const IndexRange range = this->lengths_range_for_curve(curve_index, cyclic);
return this->runtime->evaluated_length_cache.as_span().slice(range);
return this->runtime->evaluated_length_cache.data().as_span().slice(range);
}
inline float CurvesGeometry::evaluated_length_total_for_curve(const int curve_index,

Some files were not shown because too many files have changed in this diff.