Merge branch 'master' into temp-image-engine

Rename some methods to represent their internal flag.
Add back UDIM support.
2022-12-07 15:04:39 +01:00 · 2022-12-07 15:03:36 +01:00 · 2022-12-07 15:01:17 +01:00 · 2022-12-07 13:56:21 +00:00 · 2022-12-07 14:47:11 +01:00 · 2022-12-07 14:44:45 +01:00
1488 changed files with 68431 additions and 37591 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1239,12 +1239,11 @@ if(WITH_OPENGL)
  add_definitions(-DWITH_OPENGL)
 endif()

-
-# -----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
 # Configure Vulkan.

 if(WITH_VULKAN_BACKEND)
-  add_definitions(-DWITH_VULKAN_BACKEND)
+  list(APPEND BLENDER_GL_LIBRARIES ${VULKAN_LIBRARIES})
 endif()

 # -----------------------------------------------------------------------------
--- a/build_files/build_environment/cmake/download.cmake
+++ b/build_files/build_environment/cmake/download.cmake
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 ## Update and uncomment this in the release branch
-set(BLENDER_VERSION 3.4)
+# set(BLENDER_VERSION 3.1)

 function(download_source dep)
  set(TARGET_FILE ${${dep}_FILE})
--- a/build_files/cmake/Modules/FindMoltenVK.cmake
+++ b/build_files/cmake/Modules/FindMoltenVK.cmake
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2022 Blender Foundation.
+
+# - Find MoltenVK libraries
+# Find the MoltenVK includes and libraries
+# This module defines
+#  MOLTENVK_INCLUDE_DIRS, where to find MoltenVK headers, Set when
+#                        MOLTENVK_INCLUDE_DIR is found.
+#  MOLTENVK_LIBRARIES, libraries to link against to use MoltenVK.
+#  MOLTENVK_ROOT_DIR, The base directory to search for MoltenVK.
+#                    This can also be an environment variable.
+#  MOLTENVK_FOUND, If false, do not try to use MoltenVK.
+#
+
+# If MOLTENVK_ROOT_DIR was defined in the environment, use it.
+IF(NOT MOLTENVK_ROOT_DIR AND NOT $ENV{MOLTENVK_ROOT_DIR} STREQUAL "")
+  SET(MOLTENVK_ROOT_DIR $ENV{MOLTENVK_ROOT_DIR})
+ENDIF()
+
+SET(_moltenvk_SEARCH_DIRS
+  ${MOLTENVK_ROOT_DIR}
+  ${LIBDIR}/vulkan/MoltenVK
+)
+
+
+FIND_PATH(MOLTENVK_INCLUDE_DIR
+  NAMES
+    MoltenVK/vk_mvk_moltenvk.h
+  HINTS
+    ${_moltenvk_SEARCH_DIRS}
+  PATH_SUFFIXES
+    include
+)
+
+FIND_LIBRARY(MOLTENVK_LIBRARY
+  NAMES
+    MoltenVK
+  HINTS
+    ${_moltenvk_SEARCH_DIRS}
+  PATH_SUFFIXES
+    dylib/macOS
+)
+
+# handle the QUIETLY and REQUIRED arguments and set MOLTENVK_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(MoltenVK DEFAULT_MSG MOLTENVK_LIBRARY MOLTENVK_INCLUDE_DIR)
+
+IF(MOLTENVK_FOUND)
+  SET(MOLTENVK_LIBRARIES ${MOLTENVK_LIBRARY})
+  SET(MOLTENVK_INCLUDE_DIRS ${MOLTENVK_INCLUDE_DIR})
+ENDIF()
+
+MARK_AS_ADVANCED(
+  MOLTENVK_INCLUDE_DIR
+  MOLTENVK_LIBRARY
+)
+
+UNSET(_moltenvk_SEARCH_DIRS)
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -100,6 +100,23 @@ if(WITH_USD)
  find_package(USD REQUIRED)
 endif()

+if(WITH_VULKAN_BACKEND)
+  find_package(MoltenVK REQUIRED)
+
+  if(EXISTS ${LIBDIR}/vulkan)
+    set(VULKAN_FOUND On)
+    set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan/macOS)
+    set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
+    set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/libvulkan.1.dylib)
+
+    set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR} ${MOLTENVK_INCLUDE_DIRS})
+    set(VULKAN_LIBRARIES ${VULKAN_LIBRARY} ${MOLTENVK_LIBRARIES})
+  else()
+    message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
+    set(WITH_VULKAN_BACKEND OFF)
+  endif()
+endif()
+
 if(WITH_OPENSUBDIV)
  find_package(OpenSubdiv)
 endif()
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -108,6 +108,10 @@ find_package_wrapper(ZLIB REQUIRED)
 find_package_wrapper(Zstd REQUIRED)
 find_package_wrapper(Epoxy REQUIRED)

+if(WITH_VULKAN_BACKEND)
+  find_package_wrapper(Vulkan REQUIRED)
+endif()
+
 function(check_freetype_for_brotli)
  include(CheckSymbolExists)
  set(CMAKE_REQUIRED_INCLUDES ${FREETYPE_INCLUDE_DIRS})
@@ -322,9 +326,10 @@ if(WITH_CYCLES AND WITH_CYCLES_DEVICE_ONEAPI)
  file(GLOB _sycl_runtime_libraries
    ${SYCL_ROOT_DIR}/lib/libsycl.so
    ${SYCL_ROOT_DIR}/lib/libsycl.so.*
-    ${SYCL_ROOT_DIR}/lib/libpi_level_zero.so
+    ${SYCL_ROOT_DIR}/lib/libpi_*.so
  )
  list(FILTER _sycl_runtime_libraries EXCLUDE REGEX ".*\.py")
+  list(REMOVE_ITEM _sycl_runtime_libraries "${SYCL_ROOT_DIR}/lib/libpi_opencl.so")
  list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
  unset(_sycl_runtime_libraries)
 endif()
@@ -965,16 +970,9 @@ if(WITH_COMPILER_CCACHE)
  endif()
 endif()

-# On some platforms certain atomic operations are not possible with assembly and/or intrinsics and
-# they are emulated in software with locks. For example, on armel there is no intrinsics to grant
-# 64 bit atomic operations and STL library uses libatomic to offload software emulation of atomics
-# to.
-# This function will check whether libatomic is required and if so will configure linker flags.
-# If atomic operations are possible without libatomic then linker flags are left as-is.
-function(CONFIGURE_ATOMIC_LIB_IF_NEEDED)
-  # Source which is used to enforce situation when software emulation of atomics is required.
-  # Assume that using 64bit integer gives a definitive answer (as in, if 64bit atomic operations
-  # are possible using assembly/intrinsics 8, 16, and 32 bit operations will also be possible.
+# Always link with libatomic if available, as it is required for data types
+# which don't have intrinsics.
+function(configure_atomic_lib_if_needed)
  set(_source
      "#include <atomic>
      #include <cstdint>
@@ -985,25 +983,12 @@ function(CONFIGURE_ATOMIC_LIB_IF_NEEDED)
  )

  include(CheckCXXSourceCompiles)
-  check_cxx_source_compiles("${_source}" ATOMIC_OPS_WITHOUT_LIBATOMIC)
+  set(CMAKE_REQUIRED_LIBRARIES atomic)
+  check_cxx_source_compiles("${_source}" ATOMIC_OPS_WITH_LIBATOMIC)
+  unset(CMAKE_REQUIRED_LIBRARIES)

-  if(NOT ATOMIC_OPS_WITHOUT_LIBATOMIC)
-    # Compilation of the test program has failed.
-    # Try it again with -latomic to see if this is what is needed, or whether something else is
-    # going on.
-
-    set(CMAKE_REQUIRED_LIBRARIES atomic)
-    check_cxx_source_compiles("${_source}" ATOMIC_OPS_WITH_LIBATOMIC)
-    unset(CMAKE_REQUIRED_LIBRARIES)
-
-    if(ATOMIC_OPS_WITH_LIBATOMIC)
-      set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -latomic" PARENT_SCOPE)
-    else()
-      # Atomic operations are required part of Blender and it is not possible to process forward.
-      # We expect that either standard library or libatomic will make atomics to work. If both
-      # cases has failed something fishy o na bigger scope is going on.
-      message(FATAL_ERROR "Failed to detect required configuration for atomic operations")
-    endif()
+  if(ATOMIC_OPS_WITH_LIBATOMIC)
+    set(PLATFORM_LINKFLAGS "${PLATFORM_LINKFLAGS} -latomic" PARENT_SCOPE)
  endif()
 endfunction()

--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -419,7 +419,7 @@ if(WITH_IMAGE_OPENEXR)
    warn_hardcoded_paths(OpenEXR)
    set(OPENEXR ${LIBDIR}/openexr)
    set(OPENEXR_INCLUDE_DIR ${OPENEXR}/include)
-    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR}/include/OpenEXR)
+    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR_INCLUDE_DIR}/OpenEXR)
    set(OPENEXR_LIBPATH ${OPENEXR}/lib)
    # Check if the 3.x library name exists
    # if not assume this is a 2.x library folder
@@ -568,7 +568,8 @@ if(WITH_OPENIMAGEIO)
  if(NOT OpenImageIO_FOUND)
    set(OPENIMAGEIO ${LIBDIR}/OpenImageIO)
    set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib)
-    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR})
    set(OIIO_OPTIMIZED optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO.lib optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util.lib)
    set(OIIO_DEBUG debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_d.lib debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util_d.lib)
    set(OPENIMAGEIO_LIBRARIES ${OIIO_OPTIMIZED} ${OIIO_DEBUG})
@@ -785,6 +786,14 @@ if(WITH_CYCLES AND WITH_CYCLES_OSL)
  endif()
  find_path(OSL_INCLUDE_DIR OSL/oslclosure.h PATHS ${CYCLES_OSL}/include)
  find_program(OSL_COMPILER NAMES oslc PATHS ${CYCLES_OSL}/bin)
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MAJOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+[0-9]+.*$")
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MINOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+[0-9]+.*$")
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MAJOR ${OSL_LIBRARY_VERSION_MAJOR})
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MINOR ${OSL_LIBRARY_VERSION_MINOR})
 endif()

 if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
@@ -917,6 +926,20 @@ if(WITH_HARU)
  set(HARU_LIBRARIES ${HARU_ROOT_DIR}/lib/libhpdfs.lib)
 endif()

+if(WITH_VULKAN_BACKEND)
+  if(EXISTS ${LIBDIR}/vulkan)
+    set(VULKAN_FOUND On)
+    set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan)
+    set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
+    set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR})
+    set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/vulkan-1.lib)
+    set(VULKAN_LIBRARIES ${VULKAN_LIBRARY})
+  else()
+    message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
+    set(WITH_VULKAN_BACKEND OFF)
+  endif()
+endif()
+
 if(WITH_CYCLES AND WITH_CYCLES_PATH_GUIDING)
  find_package(openpgl QUIET)
  if(openpgl_FOUND)
@@ -949,7 +972,13 @@ if(WITH_CYCLES AND WITH_CYCLES_DEVICE_ONEAPI)
  endforeach()
  unset(_sycl_runtime_libraries_glob)

-  list(APPEND _sycl_runtime_libraries ${SYCL_ROOT_DIR}/bin/pi_level_zero.dll)
+  file(GLOB _sycl_pi_runtime_libraries_glob
+    ${SYCL_ROOT_DIR}/bin/pi_*.dll
+  )
+  list(REMOVE_ITEM _sycl_pi_runtime_libraries_glob "${SYCL_ROOT_DIR}/bin/pi_opencl.dll")
+  list (APPEND _sycl_runtime_libraries ${_sycl_pi_runtime_libraries_glob})
+  unset(_sycl_pi_runtime_libraries_glob)
+
  list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
  unset(_sycl_runtime_libraries)
 endif()
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -5,38 +5,38 @@
 update-code:
    git:
        submodules:
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons_contrib
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: release/datafiles/locale
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: source/tools
    svn:
        libraries:
            darwin-arm64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin_arm64
            darwin-x86_64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin
            linux-x86_64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/linux_centos7_x86_64
            windows-amd64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/win64_vc15
        tests:
-            branch: tags/blender-3.4-release
+            branch: trunk
            commit_id: HEAD
            path: lib/tests
        benchmarks:
--- a/build_files/utils/make_bpy_wheel.py
+++ b/build_files/utils/make_bpy_wheel.py
@@ -69,6 +69,7 @@ Thanks to Tyler Alden Gubala for maintaining the original version of this packag
 # ------------------------------------------------------------------------------
 # Generic Functions

+
 def find_dominating_file(
    path: str,
    search: Sequence[str],
--- a/doc/doxygen/Doxyfile
+++ b/doc/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = Blender
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = V3.4
+PROJECT_NUMBER         = V3.5

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/doc/python_api/rst/info_gotcha.rst
+++ b/doc/python_api/rst/info_gotcha.rst
@@ -870,6 +870,26 @@ an issue but, due to internal implementation details, currently are:
  thus breaking any current iteration over ``Collection.all_objects``.


+.. rubric:: Do not:
+
+.. code-block:: python
+
+   # `all_objects` is an iterator. Using it directly while performing operations on its members that will update
+   # the memory accessed by the `all_objects` iterator will lead to invalid memory accesses and crashes.
+   for object in bpy.data.collections["Collection"].all_objects:
+        object.hide_viewport = True
+
+
+.. rubric:: Do:
+
+.. code-block:: python
+
+   # `all_objects[:]` is an independent list generated from the iterator. As long as no objects are deleted,
+   # its content will remain valid even if the data accessed by the `all_objects` iterator is modified.
+   for object in bpy.data.collections["Collection"].all_objects[:]:
+        object.hide_viewport = True
+
+
 sys.exit
 ========

--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1294,6 +1294,7 @@ def pycontext2sphinx(basepath):

            type_descr = prop.get_type_description(
                class_fmt=":class:`bpy.types.%s`",
+                mathutils_fmt=":class:`mathutils.%s`",
                collection_id=_BPY_PROP_COLLECTION_ID,
                enum_descr_override=enum_descr_override,
            )
@@ -1446,6 +1447,7 @@ def pyrna2sphinx(basepath):
            identifier = " %s" % prop.identifier

        kwargs["class_fmt"] = ":class:`%s`"
+        kwargs["mathutils_fmt"] = ":class:`mathutils.%s`"

        kwargs["collection_id"] = _BPY_PROP_COLLECTION_ID

@@ -1565,6 +1567,7 @@ def pyrna2sphinx(basepath):

            type_descr = prop.get_type_description(
                class_fmt=":class:`%s`",
+                mathutils_fmt=":class:`mathutils.%s`",
                collection_id=_BPY_PROP_COLLECTION_ID,
                enum_descr_override=enum_descr_override,
            )
@@ -1631,6 +1634,7 @@ def pyrna2sphinx(basepath):

                    type_descr = prop.get_type_description(
                        as_ret=True, class_fmt=":class:`%s`",
+                        mathutils_fmt=":class:`mathutils.%s`",
                        collection_id=_BPY_PROP_COLLECTION_ID,
                        enum_descr_override=enum_descr_override,
                    )
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -91,3 +91,7 @@ endif()
 if(WITH_COMPOSITOR_CPU)
  add_subdirectory(smaa_areatex)
 endif()
+
+if(WITH_VULKAN_BACKEND)
+  add_subdirectory(vulkan_memory_allocator)
+endif()
--- a/extern/vulkan_memory_allocator/CMakeLists.txt
+++ b/extern/vulkan_memory_allocator/CMakeLists.txt
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright 2022 Blender Foundation. All rights reserved.
+
+set(INC
+  .
+)
+
+set(INC_SYS
+  ${VULKAN_INCLUDE_DIRS}
+)
+
+set(SRC
+  vk_mem_alloc_impl.cc
+
+  vk_mem_alloc.h
+)
+
+blender_add_lib(extern_vulkan_memory_allocator "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
+
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+  target_compile_options(extern_vulkan_memory_allocator
+    PRIVATE "-Wno-nullability-completeness"
+  )
+endif()
--- a/extern/vulkan_memory_allocator/LICENSE.txt
+++ b/extern/vulkan_memory_allocator/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/extern/vulkan_memory_allocator/README.blender
+++ b/extern/vulkan_memory_allocator/README.blender
@@ -0,0 +1,5 @@
+Project: VulkanMemoryAllocator
+URL: https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator
+License: MIT
+Upstream version: a6bfc23
+Local modifications: None
--- a/extern/vulkan_memory_allocator/README.md
+++ b/extern/vulkan_memory_allocator/README.md
@@ -0,0 +1,175 @@
+# Vulkan Memory Allocator
+
+Easy to integrate Vulkan memory allocation library.
+
+**Documentation:** Browse online: [Vulkan Memory Allocator](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/) (generated from Doxygen-style comments in [include/vk_mem_alloc.h](include/vk_mem_alloc.h))
+
+**License:** MIT. See [LICENSE.txt](LICENSE.txt)
+
+**Changelog:** See [CHANGELOG.md](CHANGELOG.md)
+
+**Product page:** [Vulkan Memory Allocator on GPUOpen](https://gpuopen.com/gaming-product/vulkan-memory-allocator/)
+
+**Build status:**
+
+- Windows: [![Build status](https://ci.appveyor.com/api/projects/status/4vlcrb0emkaio2pn/branch/master?svg=true)](https://ci.appveyor.com/project/adam-sawicki-amd/vulkanmemoryallocator/branch/master)  
+- Linux: [![Build Status](https://app.travis-ci.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.svg?branch=master)](https://app.travis-ci.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator)
+
+[![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.svg)](http://isitmaintained.com/project/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator "Average time to resolve an issue")
+
+# Problem
+
+Memory allocation and resource (buffer and image) creation in Vulkan is difficult (comparing to older graphics APIs, like D3D11 or OpenGL) for several reasons:
+
+- It requires a lot of boilerplate code, just like everything else in Vulkan, because it is a low-level and high-performance API.
+- There is additional level of indirection: `VkDeviceMemory` is allocated separately from creating `VkBuffer`/`VkImage` and they must be bound together.
+- Driver must be queried for supported memory heaps and memory types. Different GPU vendors provide different types of it.
+- It is recommended to allocate bigger chunks of memory and assign parts of them to particular resources, as there is a limit on maximum number of memory blocks that can be allocated.
+
+# Features
+
+This library can help game developers to manage memory allocations and resource creation by offering some higher-level functions:
+
+1. Functions that help to choose correct and optimal memory type based on intended usage of the memory.
+   - Required or preferred traits of the memory are expressed using higher-level description comparing to Vulkan flags.
+2. Functions that allocate memory blocks, reserve and return parts of them (`VkDeviceMemory` + offset + size) to the user.
+   - Library keeps track of allocated memory blocks, used and unused ranges inside them, finds best matching unused ranges for new allocations, respects all the rules of alignment and buffer/image granularity.
+3. Functions that can create an image/buffer, allocate memory for it and bind them together - all in one call.
+
+Additional features:
+
+- Well-documented - description of all functions and structures provided, along with chapters that contain general description and example code.
+- Thread-safety: Library is designed to be used in multithreaded code. Access to a single device memory block referred by different buffers and textures (binding, mapping) is synchronized internally. Memory mapping is reference-counted.
+- Configuration: Fill optional members of `VmaAllocatorCreateInfo` structure to provide custom CPU memory allocator, pointers to Vulkan functions and other parameters.
+- Customization and integration with custom engines: Predefine appropriate macros to provide your own implementation of all external facilities used by the library like assert, mutex, atomic.
+- Support for memory mapping, reference-counted internally. Support for persistently mapped memory: Just allocate with appropriate flag and access the pointer to already mapped memory.
+- Support for non-coherent memory. Functions that flush/invalidate memory. `nonCoherentAtomSize` is respected automatically.
+- Support for resource aliasing (overlap).
+- Support for sparse binding and sparse residency: Convenience functions that allocate or free multiple memory pages at once.
+- Custom memory pools: Create a pool with desired parameters (e.g. fixed or limited maximum size) and allocate memory out of it.
+- Linear allocator: Create a pool with linear algorithm and use it for much faster allocations and deallocations in free-at-once, stack, double stack, or ring buffer fashion.
+- Support for Vulkan 1.0, 1.1, 1.2, 1.3.
+- Support for extensions (and equivalent functionality included in new Vulkan versions):
+   - VK_KHR_dedicated_allocation: Just enable it and it will be used automatically by the library.
+   - VK_KHR_buffer_device_address: Flag `VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR` is automatically added to memory allocations where needed.
+   - VK_EXT_memory_budget: Used internally if available to query for current usage and budget. If not available, it falls back to an estimation based on memory heap sizes.
+   - VK_EXT_memory_priority: Set `priority` of allocations or custom pools and it will be set automatically using this extension.
+   - VK_AMD_device_coherent_memory
+- Defragmentation of GPU and CPU memory: Let the library move data around to free some memory blocks and make your allocations better compacted.
+- Statistics: Obtain brief or detailed statistics about the amount of memory used, unused, number of allocated blocks, number of allocations etc. - globally, per memory heap, and per memory type.
+- Debug annotations: Associate custom `void* pUserData` and debug `char* pName` with each allocation.
+- JSON dump: Obtain a string in JSON format with detailed map of internal state, including list of allocations, their string names, and gaps between them.
+- Convert this JSON dump into a picture to visualize your memory. See [tools/GpuMemDumpVis](tools/GpuMemDumpVis/README.md).
+- Debugging incorrect memory usage: Enable initialization of all allocated memory with a bit pattern to detect usage of uninitialized or freed memory. Enable validation of a magic number after every allocation to detect out-of-bounds memory corruption.
+- Support for interoperability with OpenGL.
+- Virtual allocator: Interface for using core allocation algorithm to allocate any custom data, e.g. pieces of one large buffer.
+
+# Prerequisites
+
+- Self-contained C++ library in single header file. No external dependencies other than standard C and C++ library and of course Vulkan. Some features of C++14 used. STL containers, RTTI, or C++ exceptions are not used.
+- Public interface in C, in same convention as Vulkan API. Implementation in C++.
+- Error handling implemented by returning `VkResult` error codes - same way as in Vulkan.
+- Interface documented using Doxygen-style comments.
+- Platform-independent, but developed and tested on Windows using Visual Studio. Continuous integration setup for Windows and Linux. Used also on Android, MacOS, and other platforms.
+
+# Example
+
+Basic usage of this library is very simple. Advanced features are optional. After you created global `VmaAllocator` object, a complete code needed to create a buffer may look like this:
+
+```cpp
+VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
+bufferInfo.size = 65536;
+bufferInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+
+VmaAllocationCreateInfo allocInfo = {};
+allocInfo.usage = VMA_MEMORY_USAGE_AUTO;
+
+VkBuffer buffer;
+VmaAllocation allocation;
+vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr);
+```
+
+With this one function call:
+
+1. `VkBuffer` is created.
+2. `VkDeviceMemory` block is allocated if needed.
+3. An unused region of the memory block is bound to this buffer.
+
+`VmaAllocation` is an object that represents memory assigned to this buffer. It can be queried for parameters like `VkDeviceMemory` handle and offset.
+
+# How to build
+
+On Windows it is recommended to use [CMake UI](https://cmake.org/runningcmake/). Alternatively you can generate a Visual Studio project map using CMake in command line: `cmake -B./build/ -DCMAKE_BUILD_TYPE=Debug -G "Visual Studio 16 2019" -A x64 ./`
+
+On Linux:
+
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+
+The following targets are available
+
+| Target | Description | CMake option | Default setting |
+| ------------- | ------------- | ------------- | ------------- |
+| VmaSample | VMA sample application | `VMA_BUILD_SAMPLE` | `OFF` |
+| VmaBuildSampleShaders | Shaders for VmaSample | `VMA_BUILD_SAMPLE_SHADERS` | `OFF` |
+
+Please note that while VulkanMemoryAllocator library is supported on other platforms besides Windows, VmaSample is not.
+
+These CMake options are available
+
+| CMake option | Description | Default setting |
+| ------------- | ------------- | ------------- |
+| `VMA_RECORDING_ENABLED` | Enable VMA memory recording for debugging | `OFF` |
+| `VMA_USE_STL_CONTAINERS` | Use C++ STL containers instead of VMA's containers | `OFF` |
+| `VMA_STATIC_VULKAN_FUNCTIONS` | Link statically with Vulkan API | `OFF` |
+| `VMA_DYNAMIC_VULKAN_FUNCTIONS` | Fetch pointers to Vulkan functions internally (no static linking) | `ON` |
+| `VMA_DEBUG_ALWAYS_DEDICATED_MEMORY` | Every allocation will have its own memory block | `OFF` |
+| `VMA_DEBUG_INITIALIZE_ALLOCATIONS` | Automatically fill new allocations and destroyed allocations with some bit pattern | `OFF` |
+| `VMA_DEBUG_GLOBAL_MUTEX` | Enable single mutex protecting all entry calls to the library | `OFF` |
+| `VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT` | Never exceed [VkPhysicalDeviceLimits::maxMemoryAllocationCount](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#limits-maxMemoryAllocationCount) and return error | `OFF` |
+
+# Binaries
+
+The release comes with precompiled binary executable for "VulkanSample" application which contains test suite. It is compiled using Visual Studio 2019, so it requires appropriate libraries to work, including "MSVCP140.dll", "VCRUNTIME140.dll", "VCRUNTIME140_1.dll". If the launch fails with error message telling about those files missing, please download and install [Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017 and 2019](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads), "x64" version.
+
+# Read more
+
+See **[Documentation](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/)**.
+
+# Software using this library
+
+- **[X-Plane](https://x-plane.com/)**
+- **[Detroit: Become Human](https://gpuopen.com/learn/porting-detroit-3/)**
+- **[Vulkan Samples](https://github.com/LunarG/VulkanSamples)** - official Khronos Vulkan samples. License: Apache-style.
+- **[Anvil](https://github.com/GPUOpen-LibrariesAndSDKs/Anvil)** - cross-platform framework for Vulkan. License: MIT.
+- **[Filament](https://github.com/google/filament)** - physically based rendering engine for Android, Windows, Linux and macOS, from Google. Apache License 2.0.
+- **[Atypical Games - proprietary game engine](https://developer.samsung.com/galaxy-gamedev/gamedev-blog/infinitejet.html)**
+- **[Flax Engine](https://flaxengine.com/)**
+- **[Godot Engine](https://github.com/godotengine/godot/)** - multi-platform 2D and 3D game engine. License: MIT.
+- **[Lightweight Java Game Library (LWJGL)](https://www.lwjgl.org/)** - includes binding of the library for Java. License: BSD.
+- **[PowerVR SDK](https://github.com/powervr-graphics/Native_SDK)** - C++ cross-platform 3D graphics SDK, from Imagination. License: MIT.
+- **[Skia](https://github.com/google/skia)** - complete 2D graphic library for drawing Text, Geometries, and Images, from Google.
+- **[The Forge](https://github.com/ConfettiFX/The-Forge)** - cross-platform rendering framework. Apache License 2.0.
+- **[VK9](https://github.com/disks86/VK9)** - Direct3D 9 compatibility layer using Vulkan. Zlib lincese.
+- **[vkDOOM3](https://github.com/DustinHLand/vkDOOM3)** - Vulkan port of GPL DOOM 3 BFG Edition. License: GNU GPL.
+- **[vkQuake2](https://github.com/kondrak/vkQuake2)** - vanilla Quake 2 with Vulkan support. License: GNU GPL.
+- **[Vulkan Best Practice for Mobile Developers](https://github.com/ARM-software/vulkan_best_practice_for_mobile_developers)** from ARM. License: MIT.
+- **[RPCS3](https://github.com/RPCS3/rpcs3)** - PlayStation 3 emulator/debugger. License: GNU GPLv2.
+- **[PPSSPP](https://github.com/hrydgard/ppsspp)** - Playstation Portable emulator/debugger. License: GNU GPLv2+.
+
+[Many other projects on GitHub](https://github.com/search?q=AMD_VULKAN_MEMORY_ALLOCATOR_H&type=Code) and some game development studios that use Vulkan in their games.
+
+# See also
+
+- **[D3D12 Memory Allocator](https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator)** - equivalent library for Direct3D 12. License: MIT.
+- **[Awesome Vulkan](https://github.com/vinjn/awesome-vulkan)** - a curated list of awesome Vulkan libraries, debuggers and resources.
+- **[VulkanMemoryAllocator-Hpp](https://github.com/malte-v/VulkanMemoryAllocator-Hpp)** - C++ binding for this library. License: CC0-1.0.
+- **[PyVMA](https://github.com/realitix/pyvma)** - Python wrapper for this library. Author: Jean-Sébastien B. (@realitix). License: Apache 2.0.
+- **[vk-mem](https://github.com/gwihlidal/vk-mem-rs)** - Rust binding for this library. Author: Graham Wihlidal. License: Apache 2.0 or MIT.
+- **[Haskell bindings](https://hackage.haskell.org/package/VulkanMemoryAllocator)**, **[github](https://github.com/expipiplus1/vulkan/tree/master/VulkanMemoryAllocator)** - Haskell bindings for this library. Author: Ellie Hermaszewska (@expipiplus1). License BSD-3-Clause.
+- **[vma_sample_sdl](https://github.com/rextimmy/vma_sample_sdl)** - SDL port of the sample app of this library (with the goal of running it on multiple platforms, including MacOS). Author: @rextimmy. License: MIT.
+- **[vulkan-malloc](https://github.com/dylanede/vulkan-malloc)** - Vulkan memory allocation library for Rust. Based on version 1 of this library. Author: Dylan Ede (@dylanede). License: MIT / Apache 2.0.
--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
--- a/extern/vulkan_memory_allocator/vk_mem_alloc_impl.cc
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc_impl.cc
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright 2022 Blender Foundation. All rights reserved. */
+
+#ifdef __APPLE__
+#  include <MoltenVK/vk_mvk_moltenvk.h>
+#else
+#  include <vulkan/vulkan.h>
+#endif
+
+#define VMA_IMPLEMENTATION
+
+#include "vk_mem_alloc.h"
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -253,6 +253,33 @@ if(WITH_CYCLES_OSL)
  )
 endif()

+if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
+  add_definitions(-DWITH_CUDA)
+
+  if(WITH_CUDA_DYNLOAD)
+    include_directories(
+      ../../extern/cuew/include
+    )
+    add_definitions(-DWITH_CUDA_DYNLOAD)
+  else()
+    include_directories(
+      SYSTEM
+      ${CUDA_TOOLKIT_INCLUDE}
+    )
+  endif()
+endif()
+
+if(WITH_CYCLES_DEVICE_HIP)
+  add_definitions(-DWITH_HIP)
+
+  if(WITH_HIP_DYNLOAD)
+    include_directories(
+      ../../extern/hipew/include
+    )
+    add_definitions(-DWITH_HIP_DYNLOAD)
+  endif()
+endif()
+
 if(WITH_CYCLES_DEVICE_OPTIX)
  find_package(OptiX 7.3.0)

@@ -261,12 +288,16 @@ if(WITH_CYCLES_DEVICE_OPTIX)
    include_directories(
      SYSTEM
      ${OPTIX_INCLUDE_DIR}
-      )
+    )
  else()
    set_and_warn_library_found("OptiX" OPTIX_FOUND WITH_CYCLES_DEVICE_OPTIX)
  endif()
 endif()

+if(WITH_CYCLES_DEVICE_METAL)
+  add_definitions(-DWITH_METAL)
+endif()
+
 if (WITH_CYCLES_DEVICE_ONEAPI)
  add_definitions(-DWITH_ONEAPI)
 endif()
--- a/intern/cycles/blender/addon/init.py
+++ b/intern/cycles/blender/addon/init.py
@@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine):
        if not self.session:
            if self.is_preview:
                cscene = bpy.context.scene.cycles
-                use_osl = cscene.shading_system and cscene.device == 'CPU'
+                use_osl = cscene.shading_system

                engine.create(self, data, preview_osl=use_osl)
            else:
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -156,6 +156,11 @@ def with_osl():
    return _cycles.with_osl


+def osl_version():
+    import _cycles
+    return _cycles.osl_version
+
+
 def with_path_guiding():
    import _cycles
    return _cycles.with_path_guiding
@@ -199,7 +204,6 @@ def list_render_passes(scene, srl):
    if crl.use_pass_volume_indirect:       yield ("VolumeInd",     "RGB",  'COLOR')
    if srl.use_pass_emit:                  yield ("Emit",          "RGB",  'COLOR')
    if srl.use_pass_environment:           yield ("Env",           "RGB",  'COLOR')
-    if srl.use_pass_shadow:                yield ("Shadow",        "RGB",  'COLOR')
    if srl.use_pass_ambient_occlusion:     yield ("AO",            "RGB",  'COLOR')
    if crl.use_pass_shadow_catcher:        yield ("Shadow Catcher",      "RGB",  'COLOR')
    # autopep8: on
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -86,6 +86,29 @@ enum_sampling_pattern = (
    ('PROGRESSIVE_MULTI_JITTER', "Progressive Multi-Jitter", "Use Progressive Multi-Jitter random sampling pattern", 1),
 )

+enum_emission_sampling = (
+    ('NONE',
+     'None',
+     "Do not use this surface as a light for sampling",
+     0),
+    ('AUTO',
+     'Auto',
+     "Automatically determine if the surface should be treated as a light for sampling, based on estimated emission intensity",
+     1),
+    ('FRONT',
+     'Front',
+     "Treat only front side of the surface as a light, usually for closed meshes whose interior is not visible",
+     2),
+    ('BACK',
+     'Back',
+     "Treat only back side of the surface as a light for sampling",
+     3),
+    ('FRONT_BACK',
+     'Front and Back',
+     "Treat surface as a light for sampling, emitting from both the front and back side",
+     4),
+)
+
 enum_volume_sampling = (
    ('DISTANCE',
     "Distance",
@@ -147,7 +170,6 @@ enum_view3d_shading_render_pass = (
    ('EMISSION', "Emission", "Show the Emission render pass"),
    ('BACKGROUND', "Background", "Show the Background render pass"),
    ('AO', "Ambient Occlusion", "Show the Ambient Occlusion render pass"),
-    ('SHADOW', "Shadow", "Show the Shadow render pass"),
    ('SHADOW_CATCHER', "Shadow Catcher", "Show the Shadow Catcher render pass"),

    ('', "Light", ""),
@@ -290,7 +312,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
    )
    shading_system: BoolProperty(
        name="Open Shading Language",
-        description="Use Open Shading Language (CPU rendering only)",
+        description="Use Open Shading Language",
    )

    preview_pause: BoolProperty(
@@ -481,6 +503,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        default='MULTIPLE_IMPORTANCE_SAMPLING',
    )

+    use_light_tree: BoolProperty(
+        name="Light Tree",
+        description="Sample multiple lights more efficiently based on estimated contribution at every shading point",
+        default=True,
+    )
+
    min_light_bounces: IntProperty(
        name="Min Light Bounces",
        description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
@@ -622,7 +650,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):

    transparent_max_bounces: IntProperty(
        name="Transparent Max Bounces",
-        description="Maximum number of transparent bounces. This is independent of maximum number of other bounces ",
+        description="Maximum number of transparent bounces. This is independent of maximum number of other bounces",
        min=0, max=1024,
        default=8,
    )
@@ -1043,13 +1071,13 @@ class CyclesCameraSettings(bpy.types.PropertyGroup):

 class CyclesMaterialSettings(bpy.types.PropertyGroup):

-    sample_as_light: BoolProperty(
-        name="Multiple Importance Sample",
-        description="Use multiple importance sampling for this material, "
-        "disabling may reduce overall noise for large "
-        "objects that emit little light compared to other light sources",
-        default=True,
+    emission_sampling: EnumProperty(
+        name="Emission Sampling",
+        description="Sampling strategy for emissive surfaces",
+        items=enum_emission_sampling,
+        default="AUTO",
    )
+
    use_transparent_shadow: BoolProperty(
        name="Transparent Shadows",
        description="Use transparent shadows for this material if it contains a Transparent BSDF, "
@@ -1642,7 +1670,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                    col.label(text="and Windows driver version 101.3430 or newer", icon='BLANK1')
                elif sys.platform.startswith("linux"):
                    col.label(text="Requires Intel GPU with Xe-HPG architecture and", icon='BLANK1')
-                    col.label(text="  - Linux driver version xx.xx.23904 or newer", icon='BLANK1')
+                    col.label(text="  - intel-level-zero-gpu version 1.3.23904 or newer", icon='BLANK1')
                    col.label(text="  - oneAPI Level-Zero Loader", icon='BLANK1')
            elif device_type == 'METAL':
                col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -383,7 +383,6 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
        col = layout.column(align=True)
        col.prop(cscene, "min_light_bounces")
        col.prop(cscene, "min_transparent_bounces")
-        col.prop(cscene, "light_sampling_threshold", text="Light Threshold")

        for view_layer in scene.view_layers:
            if view_layer.samples > 0:
@@ -392,6 +391,31 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
                break


+class CYCLES_RENDER_PT_sampling_lights(CyclesButtonsPanel, Panel):
+    bl_label = "Lights"
+    bl_parent_id = "CYCLES_RENDER_PT_sampling"
+    bl_options = {'DEFAULT_CLOSED'}
+
+    def draw_header(self, context):
+        layout = self.layout
+        scene = context.scene
+        cscene = scene.cycles
+
+    def draw(self, context):
+        layout = self.layout
+        layout.use_property_split = True
+        layout.use_property_decorate = False
+
+        scene = context.scene
+        cscene = scene.cycles
+
+        col = layout.column(align=True)
+        col.prop(cscene, "use_light_tree")
+        sub = col.row()
+        sub.prop(cscene, "light_sampling_threshold", text="Light Threshold")
+        sub.active = not cscene.use_light_tree
+
+
 class CYCLES_RENDER_PT_subdivision(CyclesButtonsPanel, Panel):
    bl_label = "Subdivision"
    bl_options = {'DEFAULT_CLOSED'}
@@ -954,7 +978,6 @@ class CYCLES_RENDER_PT_passes_light(CyclesButtonsPanel, Panel):
        col = layout.column(heading="Other", align=True)
        col.prop(view_layer, "use_pass_emit", text="Emission")
        col.prop(view_layer, "use_pass_environment")
-        col.prop(view_layer, "use_pass_shadow")
        col.prop(view_layer, "use_pass_ambient_occlusion", text="Ambient Occlusion")
        col.prop(cycles_view_layer, "use_pass_shadow_catcher")

@@ -1832,9 +1855,9 @@ class CYCLES_MATERIAL_PT_settings_surface(CyclesButtonsPanel, Panel):
        cmat = mat.cycles

        col = layout.column()
-        col.prop(cmat, "sample_as_light", text="Multiple Importance")
-        col.prop(cmat, "use_transparent_shadow")
        col.prop(cmat, "displacement_method", text="Displacement")
+        col.prop(cmat, "emission_sampling")
+        col.prop(cmat, "use_transparent_shadow")

    def draw(self, context):
        self.draw_shared(self, context.material)
@@ -2307,7 +2330,10 @@ def draw_device(self, context):
        col.prop(cscene, "device")

        from . import engine
-        if engine.with_osl() and use_cpu(context):
+        if engine.with_osl() and (
+                use_cpu(context) or
+                (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1))
+        ):
            col.prop(cscene, "shading_system")


@@ -2363,6 +2389,7 @@ classes = (
    CYCLES_RENDER_PT_sampling_render_denoise,
    CYCLES_RENDER_PT_sampling_path_guiding,
    CYCLES_RENDER_PT_sampling_path_guiding_debug,
+    CYCLES_RENDER_PT_sampling_lights,
    CYCLES_RENDER_PT_sampling_advanced,
    CYCLES_RENDER_PT_light_paths,
    CYCLES_RENDER_PT_light_paths_max_bounces,
--- a/intern/cycles/blender/addon/version_update.py
+++ b/intern/cycles/blender/addon/version_update.py
@@ -99,7 +99,7 @@ def do_versions(self):
        library_versions.setdefault(library.version, []).append(library)

    # Do versioning per library, since they might have different versions.
-    max_need_versioning = (3, 0, 25)
+    max_need_versioning = (3, 5, 2)
    for version, libraries in library_versions.items():
        if version > max_need_versioning:
            continue
@@ -297,3 +297,8 @@ def do_versions(self):
                cmat = mat.cycles
                if not cmat.is_property_set("displacement_method"):
                    cmat.displacement_method = 'DISPLACEMENT'
+
+            if version <= (3, 5, 3):
+                cmat = mat.cycles
+                if not cmat.get("sample_as_light", True):
+                    cmat.emission_sampling = 'NONE'
--- a/intern/cycles/blender/display_driver.cpp
+++ b/intern/cycles/blender/display_driver.cpp
--- a/intern/cycles/blender/display_driver.h
+++ b/intern/cycles/blender/display_driver.h
@@ -15,6 +15,10 @@
 #include "util/unique_ptr.h"
 #include "util/vector.h"

+typedef struct GPUContext GPUContext;
+typedef struct GPUFence GPUFence;
+typedef struct GPUShader GPUShader;
+
 CCL_NAMESPACE_BEGIN

 /* Base class of shader used for display driver rendering. */
@@ -29,7 +33,7 @@ class BlenderDisplayShader {
  BlenderDisplayShader() = default;
  virtual ~BlenderDisplayShader() = default;

-  virtual void bind(int width, int height) = 0;
+  virtual GPUShader *bind(int width, int height) = 0;
  virtual void unbind() = 0;

  /* Get attribute location for position and texture coordinate respectively.
@@ -40,7 +44,7 @@ class BlenderDisplayShader {
 protected:
  /* Get program of this display shader.
   * NOTE: The shader needs to be bound to have access to this. */
-  virtual uint get_shader_program() = 0;
+  virtual GPUShader *get_shader_program() = 0;

  /* Cached values of various OpenGL resources. */
  int position_attribute_location_ = -1;
@@ -51,16 +55,16 @@ class BlenderDisplayShader {
 * display space shader. */
 class BlenderFallbackDisplayShader : public BlenderDisplayShader {
 public:
-  virtual void bind(int width, int height) override;
+  virtual GPUShader *bind(int width, int height) override;
  virtual void unbind() override;

 protected:
-  virtual uint get_shader_program() override;
+  virtual GPUShader *get_shader_program() override;

  void create_shader_if_needed();
  void destroy_shader();

-  uint shader_program_ = 0;
+  GPUShader *shader_program_ = 0;
  int image_texture_location_ = -1;
  int fullscreen_location_ = -1;

@@ -73,17 +77,17 @@ class BlenderDisplaySpaceShader : public BlenderDisplayShader {
 public:
  BlenderDisplaySpaceShader(BL::RenderEngine &b_engine, BL::Scene &b_scene);

-  virtual void bind(int width, int height) override;
+  virtual GPUShader *bind(int width, int height) override;
  virtual void unbind() override;

 protected:
-  virtual uint get_shader_program() override;
+  virtual GPUShader *get_shader_program() override;

  BL::RenderEngine b_engine_;
  BL::Scene &b_scene_;

  /* Cached values of various OpenGL resources. */
-  uint shader_program_ = 0;
+  GPUShader *shader_program_ = nullptr;
 };

 /* Display driver implementation which is specific for Blender viewport integration. */
@@ -122,6 +126,9 @@ class BlenderDisplayDriver : public DisplayDriver {
  void gpu_context_lock();
  void gpu_context_unlock();

+  /* Create GPU resources used by the display driver. */
+  bool gpu_resources_create();
+
  /* Destroy all GPU resources which are being used by this object. */
  void gpu_resources_destroy();

@@ -137,8 +144,8 @@ class BlenderDisplayDriver : public DisplayDriver {
  struct Tiles;
  unique_ptr<Tiles> tiles_;

-  void *gl_render_sync_ = nullptr;
-  void *gl_upload_sync_ = nullptr;
+  GPUFence *gpu_render_sync_ = nullptr;
+  GPUFence *gpu_upload_sync_ = nullptr;

  float2 zoom_ = make_float2(1.0f, 1.0f);
 };
--- a/intern/cycles/blender/python.cpp
+++ b/intern/cycles/blender/python.cpp
@@ -18,7 +18,6 @@
 #include "util/guiding.h"
 #include "util/log.h"
 #include "util/md5.h"
-#include "util/opengl.h"
 #include "util/openimagedenoise.h"
 #include "util/path.h"
 #include "util/string.h"
@@ -26,6 +25,8 @@
 #include "util/tbb.h"
 #include "util/types.h"

+#include "GPU_state.h"
+
 #ifdef WITH_OSL
 #  include "scene/osl.h"

@@ -337,7 +338,7 @@ static PyObject *view_draw_func(PyObject * /*self*/, PyObject *args)
  if (PyLong_AsVoidPtr(pyrv3d)) {
    /* 3d view drawing */
    int viewport[4];
-    glGetIntegerv(GL_VIEWPORT, viewport);
+    GPU_viewport_size_get_i(viewport);

    session->view_draw(viewport[2], viewport[3]);
  }
--- a/intern/cycles/blender/session.cpp
+++ b/intern/cycles/blender/session.cpp
@@ -559,11 +559,6 @@ static bool bake_setup_pass(Scene *scene, const string &bake_type_str, const int
                                     0);
    integrator->set_use_emission((bake_filter & BL::BakeSettings::pass_filter_EMIT) != 0);
  }
-  /* Shadow pass. */
-  else if (strcmp(bake_type, "SHADOW") == 0) {
-    type = PASS_SHADOW;
-    use_direct_light = true;
-  }
  /* Light component passes. */
  else if (strcmp(bake_type, "DIFFUSE") == 0) {
    if ((bake_filter & BL::BakeSettings::pass_filter_DIRECT) &&
--- a/intern/cycles/blender/shader.cpp
+++ b/intern/cycles/blender/shader.cpp
@@ -61,6 +61,12 @@ static DisplacementMethod get_displacement_method(PointerRNA &ptr)
      ptr, "displacement_method", DISPLACE_NUM_METHODS, DISPLACE_BUMP);
 }

+static EmissionSampling get_emission_sampling(PointerRNA &ptr)
+{
+  return (EmissionSampling)get_enum(
+      ptr, "emission_sampling", EMISSION_SAMPLING_NUM, EMISSION_SAMPLING_AUTO);
+}
+
 static int validate_enum_value(int value, int num_values, int default_value)
 {
  if (value >= num_values) {
@@ -1559,7 +1565,7 @@ void BlenderSync::sync_materials(BL::Depsgraph &b_depsgraph, bool update_all)

      /* settings */
      PointerRNA cmat = RNA_pointer_get(&b_mat.ptr, "cycles");
-      shader->set_use_mis(get_boolean(cmat, "sample_as_light"));
+      shader->set_emission_sampling_method(get_emission_sampling(cmat));
      shader->set_use_transparent_shadow(get_boolean(cmat, "use_transparent_shadow"));
      shader->set_heterogeneous_volume(!get_boolean(cmat, "homogeneous_volume"));
      shader->set_volume_sampling_method(get_volume_sampling(cmat));
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -26,7 +26,6 @@
 #include "util/foreach.h"
 #include "util/hash.h"
 #include "util/log.h"
-#include "util/opengl.h"
 #include "util/openimagedenoise.h"

 CCL_NAMESPACE_BEGIN
@@ -348,7 +347,14 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
    integrator->set_motion_blur(view_layer.use_motion_blur);
  }

-  integrator->set_light_sampling_threshold(get_float(cscene, "light_sampling_threshold"));
+  bool use_light_tree = get_boolean(cscene, "use_light_tree");
+  integrator->set_use_light_tree(use_light_tree);
+  integrator->set_light_sampling_threshold(
+      (use_light_tree) ? 0.0f : get_float(cscene, "light_sampling_threshold"));
+
+  if (integrator->use_light_tree_is_modified()) {
+    scene->light_manager->tag_update(scene, LightManager::UPDATE_ALL);
+  }

  SamplingPattern sampling_pattern = (SamplingPattern)get_enum(
      cscene, "sampling_pattern", SAMPLING_NUM_PATTERNS, SAMPLING_PATTERN_PMJ);
@@ -617,7 +623,6 @@ static bool get_known_pass_type(BL::RenderPass &b_pass, PassType &type, PassMode
  MAP_PASS("Emit", PASS_EMISSION, false);
  MAP_PASS("Env", PASS_BACKGROUND, false);
  MAP_PASS("AO", PASS_AO, false);
-  MAP_PASS("Shadow", PASS_SHADOW, false);

  MAP_PASS("BakePrimitive", PASS_BAKE_PRIMITIVE, false);
  MAP_PASS("BakeDifferential", PASS_BAKE_DIFFERENTIAL, false);
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -8,28 +8,13 @@ set(INC
 set(INC_SYS )

 if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
-  if(WITH_CUDA_DYNLOAD)
-    list(APPEND INC
-      ../../../extern/cuew/include
-    )
-    add_definitions(-DWITH_CUDA_DYNLOAD)
-  else()
-    list(APPEND INC_SYS
-      ${CUDA_TOOLKIT_INCLUDE}
-    )
+  if(NOT WITH_CUDA_DYNLOAD)
    add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
  endif()

  add_definitions(-DCYCLES_RUNTIME_OPTIX_ROOT_DIR="${CYCLES_RUNTIME_OPTIX_ROOT_DIR}")
 endif()

-if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
-  list(APPEND INC
-    ../../../extern/hipew/include
-  )
-  add_definitions(-DWITH_HIP_DYNLOAD)
-endif()
-
 set(SRC_BASE
  device.cpp
  denoise.cpp
@@ -168,24 +153,15 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
  )
 endif()

-if(WITH_CYCLES_DEVICE_CUDA)
-  add_definitions(-DWITH_CUDA)
-endif()
-if(WITH_CYCLES_DEVICE_HIP)
-  add_definitions(-DWITH_HIP)
-endif()
-if(WITH_CYCLES_DEVICE_OPTIX)
-  add_definitions(-DWITH_OPTIX)
-endif()
 if(WITH_CYCLES_DEVICE_METAL)
  list(APPEND LIB
    ${METAL_LIBRARY}
  )
-  add_definitions(-DWITH_METAL)
  list(APPEND SRC
    ${SRC_METAL}
  )
 endif()
+
 if (WITH_CYCLES_DEVICE_ONEAPI)
  if(WITH_CYCLES_ONEAPI_BINARIES)
    set(cycles_kernel_oneapi_lib_suffix "_aot")
@@ -203,7 +179,6 @@ if (WITH_CYCLES_DEVICE_ONEAPI)
  else()
    list(APPEND LIB ${SYCL_LIBRARY})
  endif()
-  add_definitions(-DWITH_ONEAPI)
  list(APPEND SRC
    ${SRC_ONEAPI}
  )
--- a/intern/cycles/device/cuda/graphics_interop.h
+++ b/intern/cycles/device/cuda/graphics_interop.h
@@ -38,7 +38,7 @@ class CUDADeviceGraphicsInterop : public DeviceGraphicsInterop {
  CUDADevice *device_ = nullptr;

  /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
-  uint opengl_pbo_id_ = 0;
+  int64_t opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;

--- a/intern/cycles/device/denoise.h
+++ b/intern/cycles/device/denoise.h
@@ -78,24 +78,4 @@ class DenoiseParams : public Node {
  }
 };

-/* All the parameters needed to perform buffer denoising on a device.
- * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is
- * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a
- * single place where they are all listed, so that it's not required to modify all device methods
- * when these parameters do change. */
-class DeviceDenoiseTask {
- public:
-  DenoiseParams params;
-
-  int num_samples;
-
-  RenderBuffers *render_buffers;
-  BufferParams buffer_params;
-
-  /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will
-   * lower the memory footprint of the denoiser but will make input passes "invalid" (from path
-   * tracer) point of view. */
-  bool allow_inplace_modification;
-};
-
 CCL_NAMESPACE_END
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -351,6 +351,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
  info.num = 0;

  info.has_nanovdb = true;
+  info.has_light_tree = true;
  info.has_osl = true;
  info.has_guiding = true;
  info.has_profiling = true;
@@ -399,6 +400,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,

    /* Accumulate device info. */
    info.has_nanovdb &= device.has_nanovdb;
+    info.has_light_tree &= device.has_light_tree;
    info.has_osl &= device.has_osl;
    info.has_guiding &= device.has_guiding;
    info.has_profiling &= device.has_profiling;
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -65,6 +65,7 @@ class DeviceInfo {
  int num;
  bool display_device;        /* GPU is used as a display device. */
  bool has_nanovdb;           /* Support NanoVDB volumes. */
+  bool has_light_tree;        /* Support light tree. */
  bool has_osl;               /* Support Open Shading Language. */
  bool has_guiding;           /* Support path guiding. */
  bool has_profiling;         /* Supports runtime collection of profiling info. */
@@ -84,6 +85,7 @@ class DeviceInfo {
    cpu_threads = 0;
    display_device = false;
    has_nanovdb = false;
+    has_light_tree = true;
    has_osl = false;
    has_guiding = false;
    has_profiling = false;
@@ -160,6 +162,11 @@ class Device {
    return true;
  }

+  virtual bool load_osl_kernels()
+  {
+    return true;
+  }
+
  /* GPU device only functions.
   * These may not be used on CPU or multi-devices. */

@@ -228,21 +235,6 @@ class Device {
    return nullptr;
  }

-  /* Buffer denoising. */
-
-  /* Returns true if task is fully handled. */
-  virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
-  {
-    LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
-    return false;
-  }
-
-  virtual DeviceQueue *get_denoise_queue()
-  {
-    LOG(ERROR) << "Request denoising queue from a device which does not support it.";
-    return nullptr;
-  }
-
  /* Sub-devices */

  /* Run given callback for every individual device which will be handling rendering.
--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -137,6 +137,7 @@ void device_hip_info(vector<DeviceInfo> &devices)
    info.num = num;

    info.has_nanovdb = true;
+    info.has_light_tree = false;
    info.denoisers = 0;

    info.has_gpu_queue = true;
--- a/intern/cycles/device/hip/graphics_interop.h
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -36,7 +36,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
  HIPDevice *device_ = nullptr;

  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
-  uint opengl_pbo_id_ = 0;
+  int64_t opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;

--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -7,6 +7,30 @@

 CCL_NAMESPACE_BEGIN

+bool device_kernel_has_shading(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY);
+}
+
+bool device_kernel_has_intersection(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
+}
+
 const char *device_kernel_as_string(DeviceKernel kernel)
 {
  switch (kernel) {
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -11,6 +11,9 @@

 CCL_NAMESPACE_BEGIN

+bool device_kernel_has_shading(DeviceKernel kernel);
+bool device_kernel_has_intersection(DeviceKernel kernel);
+
 const char *device_kernel_as_string(DeviceKernel kernel);
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);

--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -117,6 +117,8 @@ class MetalDevice : public Device {
  /* ------------------------------------------------------------------ */
  /* low-level memory management */

+  bool max_working_set_exceeded(size_t safety_margin = 8 * 1024 * 1024) const;
+
  MetalMem *generic_alloc(device_memory &mem);

  void generic_copy_to(device_memory &mem);
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -446,6 +446,14 @@ void MetalDevice::erase_allocation(device_memory &mem)
  }
 }

+bool MetalDevice::max_working_set_exceeded(size_t safety_margin) const
+{
+  /* We're allowed to allocate beyond the safe working set size, but then if all resources are made
+   * resident we will get command buffer failures at render time. */
+  size_t available = [mtlDevice recommendedMaxWorkingSetSize] - safety_margin;
+  return (stats.mem_used > available);
+}
+
 MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
 {
  size_t size = mem.memory_size();
@@ -523,6 +531,11 @@ MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
    mmem->use_UMA = false;
  }

+  if (max_working_set_exceeded()) {
+    set_error("System is out of GPU memory");
+    return nullptr;
+  }
+
  return mmem;
 }

@@ -921,9 +934,8 @@ void MetalDevice::tex_alloc(device_texture &mem)
              << string_human_readable_size(mem.memory_size()) << ")";

    mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
-    assert(mtlTexture);
-
    if (!mtlTexture) {
+      set_error("System is out of GPU memory");
      return;
    }

@@ -955,7 +967,10 @@ void MetalDevice::tex_alloc(device_texture &mem)
              << string_human_readable_size(mem.memory_size()) << ")";

    mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
-    assert(mtlTexture);
+    if (!mtlTexture) {
+      set_error("System is out of GPU memory");
+      return;
+    }

    [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
                  mipmapLevel:0
@@ -1017,6 +1032,10 @@ void MetalDevice::tex_alloc(device_texture &mem)
  need_texture_info = true;

  texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
+
+  if (max_working_set_exceeded()) {
+    set_error("System is out of GPU memory");
+  }
 }

 void MetalDevice::tex_free(device_texture &mem)
@@ -1077,6 +1096,10 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
      }
    }
  }
+
+  if (max_working_set_exceeded()) {
+    set_error("System is out of GPU memory");
+  }
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
 struct ShaderCache {
  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
  {
+    /* Initialize occupancy tuning LUT. */
+    if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+      switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+        default:
+        case APPLE_M2:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+          break;
+        case APPLE_M1:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+          break;
+      }
+    }
  }
  ~ShaderCache();

@@ -73,6 +103,11 @@ struct ShaderCache {
    std::function<void(MetalKernelPipeline *)> completionHandler;
  };

+  struct OccupancyTuningParameters {
+    int threads_per_threadgroup = 0;
+    int num_threads_per_block = 0;
+  } occupancy_tuning[DEVICE_KERNEL_NUM];
+
  std::mutex cache_mutex;

  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
  request.pipeline->device_kernel = device_kernel;
  request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

+  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+    request.pipeline->threads_per_threadgroup =
+        occupancy_tuning[device_kernel].threads_per_threadgroup;
+    request.pipeline->num_threads_per_block =
+        occupancy_tuning[device_kernel].num_threads_per_block;
+  }
+
  /* metalrt options */
  request.pipeline->use_metalrt = device->use_metalrt;
  request.pipeline->metalrt_features = device->use_metalrt ?
@@ -384,13 +426,6 @@ void MetalKernelPipeline::compile()
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

-  int threads_per_threadgroup = this->threads_per_threadgroup;
-  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
-      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
-    /* Always use 512 for the sorting kernels */
-    threads_per_threadgroup = 512;
-  }
-
  NSString *entryPoint = [@(function_name.c_str()) copy];

  NSError *error = NULL;
@@ -601,7 +636,9 @@ void MetalKernelPipeline::compile()
    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

-    if (path_exists(metalbin_path) && use_binary_archive) {
+    /* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as
+     * intended. */
+    if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) {
      if (@available(macOS 11.0, *)) {
        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
@@ -662,12 +699,14 @@ void MetalKernelPipeline::compile()
      return;
    }

-    int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
-                                           computePipelineState.threadExecutionWidth);
-    num_threads_per_block = std::max(num_threads_per_block,
-                                     (int)computePipelineState.threadExecutionWidth);
+    if (!num_threads_per_block) {
+      num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                         computePipelineState.threadExecutionWidth);
+      num_threads_per_block = std::max(num_threads_per_block,
+                                       (int)computePipelineState.threadExecutionWidth);
+    }
+
    this->pipeline = computePipelineState;
-    this->num_threads_per_block = num_threads_per_block;

    if (@available(macOS 11.0, *)) {
      if (creating_new_archive || recreate_archive) {
@@ -676,6 +715,9 @@ void MetalKernelPipeline::compile()
          metal_printf("Failed to save binary archive, error:\n%s\n",
                       [[error localizedDescription] UTF8String]);
        }
+        else {
+          path_cache_kernel_mark_added_and_clear_old(metalbin_path);
+        }
      }
    }
  };
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -138,6 +138,15 @@ class MultiDevice : public Device {
    return true;
  }

+  bool load_osl_kernels() override
+  {
+    foreach (SubDevice &sub, devices)
+      if (!sub.device->load_osl_kernels())
+        return false;
+
+    return true;
+  }
+
  void build_bvh(BVH *bvh, Progress &progress, bool refit) override
  {
    /* Try to build and share a single acceleration structure, if possible */
@@ -204,10 +213,12 @@ class MultiDevice : public Device {

  virtual void *get_cpu_osl_memory() override
  {
-    if (devices.size() > 1) {
+    /* Always return the OSL memory of the CPU device (this works since the constructor above
+     * guarantees that CPU devices are always added to the back). */
+    if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
      return NULL;
    }
-    return devices.front().device->get_cpu_osl_memory();
+    return devices.back().device->get_cpu_osl_memory();
  }

  bool is_resident(device_ptr key, Device *sub_device) override
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -31,6 +31,8 @@ bool device_oneapi_init()
   * improves stability as of intel/LLVM SYCL-nightly/20220529.
   * All these env variable can be set beforehand by end-users and
   * will in that case -not- be overwritten. */
+  /* By default, enable only Level-Zero and if all devices are allowed, also CUDA and HIP.
+   * OpenCL backend isn't currently well supported. */
 #  ifdef _WIN32
  if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) {
    _putenv_s("SYCL_CACHE_PERSISTENT", "1");
@@ -39,7 +41,12 @@ bool device_oneapi_init()
    _putenv_s("SYCL_CACHE_THRESHOLD", "0");
  }
  if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-    _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
+    if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
+      _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
+    }
+    else {
+      _putenv_s("SYCL_DEVICE_FILTER", "level_zero,cuda,hip");
+    }
  }
  if (getenv("SYCL_ENABLE_PCI") == nullptr) {
    _putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +57,12 @@ bool device_oneapi_init()
 #  elif __linux__
  setenv("SYCL_CACHE_PERSISTENT", "1", false);
  setenv("SYCL_CACHE_THRESHOLD", "0", false);
-  setenv("SYCL_DEVICE_FILTER", "level_zero", false);
+  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
+    setenv("SYCL_DEVICE_FILTER", "level_zero", false);
+  }
+  else {
+    setenv("SYCL_DEVICE_FILTER", "level_zero,cuda,hip", false);
+  }
  setenv("SYCL_ENABLE_PCI", "1", false);
  setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
 #  endif
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -430,9 +430,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
  (void)usm_type;
  assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::cpu || allow_host) &&
-              usm_type == sycl::usm::alloc::host ||
-          usm_type == sycl::usm::alloc::unknown));
+         (usm_type == sycl::usm::alloc::host &&
+          (allow_host || device_type == sycl::info::device_type::cpu)) ||
+         usm_type == sycl::usm::alloc::unknown);
 #  else
  /* Silence warning about unused arguments. */
  (void)queue_;
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -9,6 +9,10 @@

 #include "util/log.h"

+#ifdef WITH_OSL
+#  include <OSL/oslversion.h>
+#endif
+
 #ifdef WITH_OPTIX
 #  include <optix_function_table_definition.h>
 #endif
@@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo

    info.type = DEVICE_OPTIX;
    info.id += "_OptiX";
+#  if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
+    info.has_osl = true;
+#  endif
    info.denoisers |= DENOISER_OPTIX;

    devices.push_back(info);
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -1,16 +1,14 @@
 /* SPDX-License-Identifier: Apache-2.0
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019-2022 Blender Foundation. */
+ * Copyright 2019, NVIDIA Corporation
+ * Copyright 2019-2022 Blender Foundation */

 #pragma once

 #ifdef WITH_OPTIX

 #  include "device/cuda/device_impl.h"
-#  include "device/optix/queue.h"
 #  include "device/optix/util.h"
-#  include "kernel/types.h"
-#  include "util/unique_ptr.h"
+#  include "kernel/osl/globals.h"

 CCL_NAMESPACE_BEGIN

@@ -23,8 +21,16 @@ enum {
  PG_RGEN_INTERSECT_SHADOW,
  PG_RGEN_INTERSECT_SUBSURFACE,
  PG_RGEN_INTERSECT_VOLUME_STACK,
+  PG_RGEN_SHADE_BACKGROUND,
+  PG_RGEN_SHADE_LIGHT,
+  PG_RGEN_SHADE_SURFACE,
  PG_RGEN_SHADE_SURFACE_RAYTRACE,
  PG_RGEN_SHADE_SURFACE_MNEE,
+  PG_RGEN_SHADE_VOLUME,
+  PG_RGEN_SHADE_SHADOW,
+  PG_RGEN_EVAL_DISPLACE,
+  PG_RGEN_EVAL_BACKGROUND,
+  PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
  PG_MISS,
  PG_HITD, /* Default hit group. */
  PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -40,14 +46,14 @@ enum {
 };

 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
-static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
 static const int NUM_HIT_PROGRAM_GROUPS = 8;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;

 /* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };

 /* A single shader binding table entry. */
 struct SbtRecord {
@@ -61,52 +67,35 @@ class OptiXDevice : public CUDADevice {
  OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
  OptixModule builtin_modules[2] = {};
  OptixPipeline pipelines[NUM_PIPELINES] = {};
+  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+  OptixPipelineCompileOptions pipeline_options = {};

-  bool motion_blur = false;
  device_vector<SbtRecord> sbt_data;
  device_only_memory<KernelParamsOptiX> launch_params;
-  OptixTraversableHandle tlas_handle = 0;

+#  ifdef WITH_OSL
+  OSLGlobals osl_globals;
+  vector<OptixModule> osl_modules;
+  vector<OptixProgramGroup> osl_groups;
+#  endif
+
+ private:
+  OptixTraversableHandle tlas_handle = 0;
  vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
  thread_mutex delayed_free_bvh_mutex;

-  class Denoiser {
-   public:
-    explicit Denoiser(OptiXDevice *device);
-
-    OptiXDevice *device;
-    OptiXDeviceQueue queue;
-
-    OptixDenoiser optix_denoiser = nullptr;
-
-    /* Configuration size, as provided to `optixDenoiserSetup`.
-     * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the
-     * `is_configured` will be false. */
-    bool is_configured = false;
-    int2 configured_size = make_int2(0, 0);
-
-    /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
-     * The memory layout goes as following: [denoiser state][scratch buffer]. */
-    device_only_memory<unsigned char> state;
-    OptixDenoiserSizes sizes = {};
-
-    bool use_pass_albedo = false;
-    bool use_pass_normal = false;
-    bool use_pass_flow = false;
-  };
-  Denoiser denoiser_;
-
 public:
  OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
  ~OptiXDevice();

- private:
  BVHLayoutMask get_bvh_layout_mask() const override;

  string compile_kernel_get_common_cflags(const uint kernel_features);

  bool load_kernels(const uint kernel_features) override;

+  bool load_osl_kernels() override;
+
  bool build_optix_bvh(BVHOptiX *bvh,
                       OptixBuildOperation operation,
                       const OptixBuildInput &build_input,
@@ -123,52 +112,7 @@ class OptiXDevice : public CUDADevice {

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

-  /* --------------------------------------------------------------------
-   * Denoising.
-   */
-
-  class DenoiseContext;
-  class DenoisePass;
-
-  virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
-  virtual DeviceQueue *get_denoise_queue() override;
-
-  /* Read guiding passes from the render buffers, preprocess them in a way which is expected by
-   * OptiX and store in the guiding passes memory within the given context.
-   *
-   * Pre=-processing of the guiding passes is to only happen once per context lifetime. DO not
-   * preprocess them for every pass which is being denoised. */
-  bool denoise_filter_guiding_preprocess(DenoiseContext &context);
-
-  /* Set fake albedo pixels in the albedo guiding pass storage.
-   * After this point only passes which do not need albedo for denoising can be processed. */
-  bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
-
-  void denoise_pass(DenoiseContext &context, PassType pass_type);
-
-  /* Read input color pass from the render buffer into the memory which corresponds to the noisy
-   * input within the given context. Pixels are scaled to the number of samples, but are not
-   * preprocessed yet. */
-  void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
-
-  /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
-   * denoiser result to the render buffer. */
-  bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
-  bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
-
-  /* Make sure the OptiX denoiser is created and configured. */
-  bool denoise_ensure(DenoiseContext &context);
-
-  /* Create OptiX denoiser descriptor if needed.
-   * Will do nothing if the current OptiX descriptor is usable for the given parameters.
-   * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */
-  bool denoise_create_if_needed(DenoiseContext &context);
-
-  /* Configure existing OptiX denoiser descriptor for the use for the given task. */
-  bool denoise_configure_if_needed(DenoiseContext &context);
-
-  /* Run configured denoiser. */
-  bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+  void *get_cpu_osl_memory() override;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
  CUDADeviceQueue::init_execution();
 }

-static bool is_optix_specific_kernel(DeviceKernel kernel)
+static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
 {
-  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+#  ifdef WITH_OSL
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl && device_kernel_has_shading(kernel)) {
+    return true;
+  }
+#  else
+  (void)use_osl;
+#  endif
+
+  return device_kernel_has_intersection(kernel);
 }

 bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               DeviceKernelArguments const &args)
 {
-  if (!is_optix_specific_kernel(kernel)) {
+  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+#  ifdef WITH_OSL
+  const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
+#  else
+  const bool use_osl = false;
+#  endif
+
+  if (!is_optix_specific_kernel(kernel, use_osl)) {
    return CUDADeviceQueue::enqueue(kernel, work_size, args);
  }

@@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,

  const CUDAContextScope scope(cuda_device_);

-  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
-
  const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
  const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;

@@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                        sizeof(device_ptr),
                        cuda_stream_));

-  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
    cuda_device_assert(
        cuda_device_,
        cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                          sizeof(device_ptr),
                          cuda_stream_));
  }
+  if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
+    cuda_device_assert(cuda_device_,
+                       cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
+                                         args.values[2],  // &d_offset
+                                         sizeof(int32_t),
+                                         cuda_stream_));
+  }

  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));

@@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
  OptixShaderBindingTable sbt_params = {};

  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
+      break;
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
-      pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
      break;
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
-      pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
+      break;
+
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
      pipeline = optix_device->pipelines[PIP_INTERSECT];
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
      break;

+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr +
+                                PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
+      break;
+
    default:
      LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
                 << " is attempted to be enqueued.";
@@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,

  sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
  sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
-  sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
  sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
  sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
  sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
  sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
  sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);

+#  ifdef WITH_OSL
+  if (use_osl) {
+    sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
+  }
+#  endif
+
  /* Launch the ray generation program. */
  optix_device_assert(optix_device,
                      optixLaunch(pipeline,
--- a/intern/cycles/integrator/CMakeLists.txt
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -8,7 +8,7 @@ set(INC
 set(SRC
  adaptive_sampling.cpp
  denoiser.cpp
-  denoiser_device.cpp
+  denoiser_gpu.cpp
  denoiser_oidn.cpp
  denoiser_optix.cpp
  path_trace.cpp
@@ -30,7 +30,7 @@ set(SRC
 set(SRC_HEADERS
  adaptive_sampling.h
  denoiser.h
-  denoiser_device.h
+  denoiser_gpu.h
  denoiser_oidn.h
  denoiser_optix.h
  path_trace.h
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -16,9 +16,11 @@ unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoisePa
 {
  DCHECK(params.use);

+#ifdef WITH_OPTIX
  if (params.type == DENOISER_OPTIX && Device::available_devices(DEVICE_MASK_OPTIX).size()) {
    return make_unique<OptiXDenoiser>(path_trace_device, params);
  }
+#endif

  /* Always fallback to OIDN. */
  DenoiseParams oidn_params = params;
--- a/intern/cycles/integrator/denoiser_device.h
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#pragma once
-
-#include "integrator/denoiser.h"
-#include "util/unique_ptr.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Denoiser which uses device-specific denoising implementation, such as OptiX denoiser which are
- * implemented as a part of a driver of specific device.
- *
- * This implementation makes sure the to-be-denoised buffer is available on the denoising device
- * and invoke denoising kernel via device API. */
-class DeviceDenoiser : public Denoiser {
- public:
-  DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
-  ~DeviceDenoiser();
-
-  virtual bool denoise_buffer(const BufferParams &buffer_params,
-                              RenderBuffers *render_buffers,
-                              const int num_samples,
-                              bool allow_inplace_modification) override;
-};
-
-CCL_NAMESPACE_END
--- a/intern/cycles/integrator/denoiser_device.cpp
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#include "integrator/denoiser_device.h"
+#include "integrator/denoiser_gpu.h"

 #include "device/denoise.h"
 #include "device/device.h"
@@ -13,27 +13,27 @@

 CCL_NAMESPACE_BEGIN

-DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+DenoiserGPU::DenoiserGPU(Device *path_trace_device, const DenoiseParams &params)
    : Denoiser(path_trace_device, params)
 {
 }

-DeviceDenoiser::~DeviceDenoiser()
+DenoiserGPU::~DenoiserGPU()
 {
  /* Explicit implementation, to allow forward declaration of Device in the header. */
 }

-bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
-                                    RenderBuffers *render_buffers,
-                                    const int num_samples,
-                                    bool allow_inplace_modification)
+bool DenoiserGPU::denoise_buffer(const BufferParams &buffer_params,
+                                 RenderBuffers *render_buffers,
+                                 const int num_samples,
+                                 bool allow_inplace_modification)
 {
  Device *denoiser_device = get_denoiser_device();
  if (!denoiser_device) {
    return false;
  }

-  DeviceDenoiseTask task;
+  DenoiseTask task;
  task.params = params_;
  task.num_samples = num_samples;
  task.buffer_params = buffer_params;
@@ -50,8 +50,6 @@ bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
  else {
    VLOG_WORK << "Creating temporary buffer on denoiser device.";

-    DeviceQueue *queue = denoiser_device->get_denoise_queue();
-
    /* Create buffer which is available by the device used by denoiser. */

    /* TODO(sergey): Optimize data transfers. For example, only copy denoising related passes,
@@ -70,13 +68,13 @@ bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
           render_buffers->buffer.data(),
           sizeof(float) * local_render_buffers.buffer.size());

-    queue->copy_to_device(local_render_buffers.buffer);
+    denoiser_queue_->copy_to_device(local_render_buffers.buffer);

    task.render_buffers = &local_render_buffers;
    task.allow_inplace_modification = true;
  }

-  const bool denoise_result = denoiser_device->denoise_buffer(task);
+  const bool denoise_result = denoise_buffer(task);

  if (local_buffer_used) {
    local_render_buffers.copy_from_device();
@@ -90,4 +88,21 @@ bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
  return denoise_result;
 }

+Device *DenoiserGPU::ensure_denoiser_device(Progress *progress)
+{
+  Device *denoiser_device = Denoiser::ensure_denoiser_device(progress);
+  if (!denoiser_device) {
+    return nullptr;
+  }
+
+  if (!denoiser_queue_) {
+    denoiser_queue_ = denoiser_device->gpu_queue_create();
+    if (!denoiser_queue_) {
+      return nullptr;
+    }
+  }
+
+  return denoiser_device;
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/integrator/denoiser_gpu.h
+++ b/intern/cycles/integrator/denoiser_gpu.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of Denoiser which uses a device-specific denoising implementation, running on a
+ * GPU device queue. It makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes denoising kernels via the device queue API. */
+class DenoiserGPU : public Denoiser {
+ public:
+  DenoiserGPU(Device *path_trace_device, const DenoiseParams &params);
+  ~DenoiserGPU();
+
+  virtual bool denoise_buffer(const BufferParams &buffer_params,
+                              RenderBuffers *render_buffers,
+                              const int num_samples,
+                              bool allow_inplace_modification) override;
+
+ protected:
+  /* All the parameters needed to perform buffer denoising on a device.
+   * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is
+   * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a
+   * single place where they are all listed, so that it's not required to modify all device methods
+   * when these parameters do change. */
+  class DenoiseTask {
+   public:
+    DenoiseParams params;
+
+    int num_samples;
+
+    RenderBuffers *render_buffers;
+    BufferParams buffer_params;
+
+    /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will
+     * lower the memory footprint of the denoiser but will make input passes "invalid" (from path
+     * tracer) point of view. */
+    bool allow_inplace_modification;
+  };
+
+  /* Returns true if task is fully handled. */
+  virtual bool denoise_buffer(const DenoiseTask & /*task*/) = 0;
+
+  virtual Device *ensure_denoiser_device(Progress *progress) override;
+
+  unique_ptr<DeviceQueue> denoiser_queue_;
+};
+
+CCL_NAMESPACE_END
--- a/intern/cycles/integrator/denoiser_optix.cpp
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -1,16 +1,216 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#include "integrator/denoiser_optix.h"
+#ifdef WITH_OPTIX

-#include "device/denoise.h"
-#include "device/device.h"
+#  include "integrator/denoiser_optix.h"
+#  include "integrator/pass_accessor_gpu.h"
+
+#  include "device/optix/device_impl.h"
+#  include "device/optix/queue.h"
+
+#  include <optix_denoiser_tiling.h>

 CCL_NAMESPACE_BEGIN

-OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
-    : DeviceDenoiser(path_trace_device, params)
+#  if OPTIX_ABI_VERSION >= 60
+using ::optixUtilDenoiserInvokeTiled;
+#  else
+// A minimal copy of functionality `optix_denoiser_tiling.h` which allows to fix integer overflow
+// issues without bumping SDK or driver requirement.
+//
+// The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
+static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
+                                               const OptixImage2D &output,
+                                               unsigned int overlapWindowSizeInPixels,
+                                               unsigned int tileWidth,
+                                               unsigned int tileHeight,
+                                               std::vector<OptixUtilDenoiserImageTile> &tiles)
 {
+  if (tileWidth == 0 || tileHeight == 0)
+    return OPTIX_ERROR_INVALID_VALUE;
+
+  unsigned int inPixelStride = optixUtilGetPixelStride(input);
+  unsigned int outPixelStride = optixUtilGetPixelStride(output);
+
+  int inp_w = std::min(tileWidth + 2 * overlapWindowSizeInPixels, input.width);
+  int inp_h = std::min(tileHeight + 2 * overlapWindowSizeInPixels, input.height);
+  int inp_y = 0, copied_y = 0;
+
+  do {
+    int inputOffsetY = inp_y == 0 ? 0 :
+                                    std::max((int)overlapWindowSizeInPixels,
+                                             inp_h - ((int)input.height - inp_y));
+    int copy_y = inp_y == 0 ? std::min(input.height, tileHeight + overlapWindowSizeInPixels) :
+                              std::min(tileHeight, input.height - copied_y);
+
+    int inp_x = 0, copied_x = 0;
+    do {
+      int inputOffsetX = inp_x == 0 ? 0 :
+                                      std::max((int)overlapWindowSizeInPixels,
+                                               inp_w - ((int)input.width - inp_x));
+      int copy_x = inp_x == 0 ? std::min(input.width, tileWidth + overlapWindowSizeInPixels) :
+                                std::min(tileWidth, input.width - copied_x);
+
+      OptixUtilDenoiserImageTile tile;
+      tile.input.data = input.data + (size_t)(inp_y - inputOffsetY) * input.rowStrideInBytes +
+                        +(size_t)(inp_x - inputOffsetX) * inPixelStride;
+      tile.input.width = inp_w;
+      tile.input.height = inp_h;
+      tile.input.rowStrideInBytes = input.rowStrideInBytes;
+      tile.input.pixelStrideInBytes = input.pixelStrideInBytes;
+      tile.input.format = input.format;
+
+      tile.output.data = output.data + (size_t)inp_y * output.rowStrideInBytes +
+                         (size_t)inp_x * outPixelStride;
+      tile.output.width = copy_x;
+      tile.output.height = copy_y;
+      tile.output.rowStrideInBytes = output.rowStrideInBytes;
+      tile.output.pixelStrideInBytes = output.pixelStrideInBytes;
+      tile.output.format = output.format;
+
+      tile.inputOffsetX = inputOffsetX;
+      tile.inputOffsetY = inputOffsetY;
+      tiles.push_back(tile);
+
+      inp_x += inp_x == 0 ? tileWidth + overlapWindowSizeInPixels : tileWidth;
+      copied_x += copy_x;
+    } while (inp_x < static_cast<int>(input.width));
+
+    inp_y += inp_y == 0 ? tileHeight + overlapWindowSizeInPixels : tileHeight;
+    copied_y += copy_y;
+  } while (inp_y < static_cast<int>(input.height));
+
+  return OPTIX_SUCCESS;
+}
+
+static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
+                                                CUstream stream,
+                                                const OptixDenoiserParams *params,
+                                                CUdeviceptr denoiserState,
+                                                size_t denoiserStateSizeInBytes,
+                                                const OptixDenoiserGuideLayer *guideLayer,
+                                                const OptixDenoiserLayer *layers,
+                                                unsigned int numLayers,
+                                                CUdeviceptr scratch,
+                                                size_t scratchSizeInBytes,
+                                                unsigned int overlapWindowSizeInPixels,
+                                                unsigned int tileWidth,
+                                                unsigned int tileHeight)
+{
+  if (!guideLayer || !layers)
+    return OPTIX_ERROR_INVALID_VALUE;
+
+  std::vector<std::vector<OptixUtilDenoiserImageTile>> tiles(numLayers);
+  std::vector<std::vector<OptixUtilDenoiserImageTile>> prevTiles(numLayers);
+  for (unsigned int l = 0; l < numLayers; l++) {
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(layers[l].input,
+                                                                 layers[l].output,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 tiles[l]))
+      return res;
+
+    if (layers[l].previousOutput.data) {
+      OptixImage2D dummyOutput = layers[l].previousOutput;
+      if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(layers[l].previousOutput,
+                                                                   dummyOutput,
+                                                                   overlapWindowSizeInPixels,
+                                                                   tileWidth,
+                                                                   tileHeight,
+                                                                   prevTiles[l]))
+        return res;
+    }
+  }
+
+  std::vector<OptixUtilDenoiserImageTile> albedoTiles;
+  if (guideLayer->albedo.data) {
+    OptixImage2D dummyOutput = guideLayer->albedo;
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->albedo,
+                                                                 dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 albedoTiles))
+      return res;
+  }
+
+  std::vector<OptixUtilDenoiserImageTile> normalTiles;
+  if (guideLayer->normal.data) {
+    OptixImage2D dummyOutput = guideLayer->normal;
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->normal,
+                                                                 dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 normalTiles))
+      return res;
+  }
+  std::vector<OptixUtilDenoiserImageTile> flowTiles;
+  if (guideLayer->flow.data) {
+    OptixImage2D dummyOutput = guideLayer->flow;
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->flow,
+                                                                 dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 flowTiles))
+      return res;
+  }
+
+  for (size_t t = 0; t < tiles[0].size(); t++) {
+    std::vector<OptixDenoiserLayer> tlayers;
+    for (unsigned int l = 0; l < numLayers; l++) {
+      OptixDenoiserLayer layer = {};
+      layer.input = (tiles[l])[t].input;
+      layer.output = (tiles[l])[t].output;
+      if (layers[l].previousOutput.data)
+        layer.previousOutput = (prevTiles[l])[t].input;
+      tlayers.push_back(layer);
+    }
+
+    OptixDenoiserGuideLayer gl = {};
+    if (guideLayer->albedo.data)
+      gl.albedo = albedoTiles[t].input;
+
+    if (guideLayer->normal.data)
+      gl.normal = normalTiles[t].input;
+
+    if (guideLayer->flow.data)
+      gl.flow = flowTiles[t].input;
+
+    if (const OptixResult res = optixDenoiserInvoke(denoiser,
+                                                    stream,
+                                                    params,
+                                                    denoiserState,
+                                                    denoiserStateSizeInBytes,
+                                                    &gl,
+                                                    &tlayers[0],
+                                                    numLayers,
+                                                    (tiles[0])[t].inputOffsetX,
+                                                    (tiles[0])[t].inputOffsetY,
+                                                    scratch,
+                                                    scratchSizeInBytes))
+      return res;
+  }
+  return OPTIX_SUCCESS;
+}
+#  endif
+
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+    : DenoiserGPU(path_trace_device, params), state_(path_trace_device, "__denoiser_state", true)
+{
+}
+
+OptiXDenoiser::~OptiXDenoiser()
+{
+  /* It is important that the OptixDenoiser handle is destroyed before the OptixDeviceContext
+   * handle, which is guaranteed since the local denoising device owning the OptiX device context
+   * is deleted as part of the Denoiser class destructor call after this. */
+  if (optix_denoiser_ != nullptr) {
+    optixDenoiserDestroy(optix_denoiser_);
+  }
 }

 uint OptiXDenoiser::get_device_type_mask() const
@@ -18,4 +218,569 @@ uint OptiXDenoiser::get_device_type_mask() const
  return DEVICE_MASK_OPTIX;
 }

+class OptiXDenoiser::DenoiseContext {
+ public:
+  explicit DenoiseContext(OptiXDevice *device, const DenoiseTask &task)
+      : denoise_params(task.params),
+        render_buffers(task.render_buffers),
+        buffer_params(task.buffer_params),
+        guiding_buffer(device, "denoiser guiding passes buffer", true),
+        num_samples(task.num_samples)
+  {
+    num_input_passes = 1;
+    if (denoise_params.use_pass_albedo) {
+      num_input_passes += 1;
+      use_pass_albedo = true;
+      pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+      if (denoise_params.use_pass_normal) {
+        num_input_passes += 1;
+        use_pass_normal = true;
+        pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+      }
+    }
+
+    if (denoise_params.temporally_stable) {
+      prev_output.device_pointer = render_buffers->buffer.device_pointer;
+
+      prev_output.offset = buffer_params.get_pass_offset(PASS_DENOISING_PREVIOUS);
+
+      prev_output.stride = buffer_params.stride;
+      prev_output.pass_stride = buffer_params.pass_stride;
+
+      num_input_passes += 1;
+      use_pass_motion = true;
+      pass_motion = buffer_params.get_pass_offset(PASS_MOTION);
+    }
+
+    use_guiding_passes = (num_input_passes - 1) > 0;
+
+    if (use_guiding_passes) {
+      if (task.allow_inplace_modification) {
+        guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+        guiding_params.pass_albedo = pass_denoising_albedo;
+        guiding_params.pass_normal = pass_denoising_normal;
+        guiding_params.pass_flow = pass_motion;
+
+        guiding_params.stride = buffer_params.stride;
+        guiding_params.pass_stride = buffer_params.pass_stride;
+      }
+      else {
+        guiding_params.pass_stride = 0;
+        if (use_pass_albedo) {
+          guiding_params.pass_albedo = guiding_params.pass_stride;
+          guiding_params.pass_stride += 3;
+        }
+        if (use_pass_normal) {
+          guiding_params.pass_normal = guiding_params.pass_stride;
+          guiding_params.pass_stride += 3;
+        }
+        if (use_pass_motion) {
+          guiding_params.pass_flow = guiding_params.pass_stride;
+          guiding_params.pass_stride += 2;
+        }
+
+        guiding_params.stride = buffer_params.width;
+
+        guiding_buffer.alloc_to_device(buffer_params.width * buffer_params.height *
+                                       guiding_params.pass_stride);
+        guiding_params.device_pointer = guiding_buffer.device_pointer;
+      }
+    }
+
+    pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+  }
+
+  const DenoiseParams &denoise_params;
+
+  RenderBuffers *render_buffers = nullptr;
+  const BufferParams &buffer_params;
+
+  /* Previous output. */
+  struct {
+    device_ptr device_pointer = 0;
+
+    int offset = PASS_UNUSED;
+
+    int stride = -1;
+    int pass_stride = -1;
+  } prev_output;
+
+  /* Device-side storage of the guiding passes. */
+  device_only_memory<float> guiding_buffer;
+
+  struct {
+    device_ptr device_pointer = 0;
+
+    /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
+    int pass_albedo = PASS_UNUSED;
+    int pass_normal = PASS_UNUSED;
+    int pass_flow = PASS_UNUSED;
+
+    int stride = -1;
+    int pass_stride = -1;
+  } guiding_params;
+
+  /* Number of input passes. Including the color and extra auxiliary passes. */
+  int num_input_passes = 0;
+  bool use_guiding_passes = false;
+  bool use_pass_albedo = false;
+  bool use_pass_normal = false;
+  bool use_pass_motion = false;
+
+  int num_samples = 0;
+
+  int pass_sample_count = PASS_UNUSED;
+
+  /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
+  int pass_denoising_albedo = PASS_UNUSED;
+  int pass_denoising_normal = PASS_UNUSED;
+  int pass_motion = PASS_UNUSED;
+
+  /* For passes which don't need albedo channel for denoising we replace the actual albedo with
+   * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+   * the fake values and denoising of passes which do need albedo can no longer happen. */
+  bool albedo_replaced_with_fake = false;
+};
+
+class OptiXDenoiser::DenoisePass {
+ public:
+  DenoisePass(const PassType type, const BufferParams &buffer_params) : type(type)
+  {
+    noisy_offset = buffer_params.get_pass_offset(type, PassMode::NOISY);
+    denoised_offset = buffer_params.get_pass_offset(type, PassMode::DENOISED);
+
+    const PassInfo pass_info = Pass::get_info(type);
+    num_components = pass_info.num_components;
+    use_compositing = pass_info.use_compositing;
+    use_denoising_albedo = pass_info.use_denoising_albedo;
+  }
+
+  PassType type;
+
+  int noisy_offset;
+  int denoised_offset;
+
+  int num_components;
+  bool use_compositing;
+  bool use_denoising_albedo;
+};
+
+bool OptiXDenoiser::denoise_buffer(const DenoiseTask &task)
+{
+  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(denoiser_device_);
+
+  const CUDAContextScope scope(optix_device);
+
+  DenoiseContext context(optix_device, task);
+
+  if (!denoise_ensure(context)) {
+    return false;
+  }
+
+  if (!denoise_filter_guiding_preprocess(context)) {
+    LOG(ERROR) << "Error preprocessing guiding passes.";
+    return false;
+  }
+
+  /* Passes which will use real albedo when it is available. */
+  denoise_pass(context, PASS_COMBINED);
+  denoise_pass(context, PASS_SHADOW_CATCHER_MATTE);
+
+  /* Passes which do not need albedo and hence if real is present it needs to become fake. */
+  denoise_pass(context, PASS_SHADOW_CATCHER);
+
+  return true;
+}
+
+bool OptiXDenoiser::denoise_filter_guiding_preprocess(const DenoiseContext &context)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &context.guiding_params.pass_normal,
+                             &context.guiding_params.pass_flow,
+                             &context.render_buffers->buffer.device_pointer,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.pass_sample_count,
+                             &context.pass_denoising_albedo,
+                             &context.pass_denoising_normal,
+                             &context.pass_motion,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &context.num_samples);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+bool OptiXDenoiser::denoise_filter_guiding_set_fake_albedo(const DenoiseContext &context)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &buffer_params.width,
+                             &buffer_params.height);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+void OptiXDenoiser::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const DenoisePass pass(pass_type, buffer_params);
+
+  if (pass.noisy_offset == PASS_UNUSED) {
+    return;
+  }
+  if (pass.denoised_offset == PASS_UNUSED) {
+    LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+    return;
+  }
+
+  if (pass.use_denoising_albedo) {
+    if (context.albedo_replaced_with_fake) {
+      LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+      return;
+    }
+  }
+  else if (context.use_guiding_passes && !context.albedo_replaced_with_fake) {
+    context.albedo_replaced_with_fake = true;
+    if (!denoise_filter_guiding_set_fake_albedo(context)) {
+      LOG(ERROR) << "Error replacing real albedo with the fake one.";
+      return;
+    }
+  }
+
+  /* Read and preprocess noisy color input pass. */
+  denoise_color_read(context, pass);
+  if (!denoise_filter_color_preprocess(context, pass)) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+    return;
+  }
+
+  if (!denoise_run(context, pass)) {
+    LOG(ERROR) << "Error running OptiX denoiser.";
+    return;
+  }
+
+  /* Store result in the combined pass of the render buffer.
+   *
+   * This will scale the denoiser result up to match the number of, possibly per-pixel, samples. */
+  if (!denoise_filter_color_postprocess(context, pass)) {
+    LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+    return;
+  }
+
+  denoiser_queue_->synchronize();
+}
+
+void OptiXDenoiser::denoise_color_read(const DenoiseContext &context, const DenoisePass &pass)
+{
+  PassAccessor::PassAccessInfo pass_access_info;
+  pass_access_info.type = pass.type;
+  pass_access_info.mode = PassMode::NOISY;
+  pass_access_info.offset = pass.noisy_offset;
+
+  /* Denoiser operates on passes which are used to calculate the approximation, and is never used
+   * on the approximation. The latter is not even possible because OptiX does not support
+   * denoising of semi-transparent pixels. */
+  pass_access_info.use_approximate_shadow_catcher = false;
+  pass_access_info.use_approximate_shadow_catcher_background = false;
+  pass_access_info.show_active_pixels = false;
+
+  /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases.
+   */
+  const PassAccessorGPU pass_accessor(
+      denoiser_queue_.get(), pass_access_info, 1.0f, context.num_samples);
+
+  PassAccessor::Destination destination(pass_access_info.type);
+  destination.d_pixels = context.render_buffers->buffer.device_pointer +
+                         pass.denoised_offset * sizeof(float);
+  destination.num_components = 3;
+  destination.pixel_stride = context.buffer_params.pass_stride;
+
+  BufferParams buffer_params = context.buffer_params;
+  buffer_params.window_x = 0;
+  buffer_params.window_y = 0;
+  buffer_params.window_width = buffer_params.width;
+  buffer_params.window_height = buffer_params.height;
+
+  pass_accessor.get_render_tile_pixels(context.render_buffers, buffer_params, destination);
+}
+
+bool OptiXDenoiser::denoise_filter_color_preprocess(const DenoiseContext &context,
+                                                    const DenoisePass &pass)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &pass.denoised_offset);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+bool OptiXDenoiser::denoise_filter_color_postprocess(const DenoiseContext &context,
+                                                     const DenoisePass &pass)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.num_samples,
+                             &pass.noisy_offset,
+                             &pass.denoised_offset,
+                             &context.pass_sample_count,
+                             &pass.num_components,
+                             &pass.use_compositing);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+bool OptiXDenoiser::denoise_ensure(DenoiseContext &context)
+{
+  if (!denoise_create_if_needed(context)) {
+    LOG(ERROR) << "OptiX denoiser creation has failed.";
+    return false;
+  }
+
+  if (!denoise_configure_if_needed(context)) {
+    LOG(ERROR) << "OptiX denoiser configuration has failed.";
+    return false;
+  }
+
+  return true;
+}
+
+bool OptiXDenoiser::denoise_create_if_needed(DenoiseContext &context)
+{
+  const bool recreate_denoiser = (optix_denoiser_ == nullptr) ||
+                                 (use_pass_albedo_ != context.use_pass_albedo) ||
+                                 (use_pass_normal_ != context.use_pass_normal) ||
+                                 (use_pass_motion_ != context.use_pass_motion);
+  if (!recreate_denoiser) {
+    return true;
+  }
+
+  /* Destroy existing handle before creating new one. */
+  if (optix_denoiser_) {
+    optixDenoiserDestroy(optix_denoiser_);
+  }
+
+  /* Create OptiX denoiser handle on demand when it is first used. */
+  OptixDenoiserOptions denoiser_options = {};
+  denoiser_options.guideAlbedo = context.use_pass_albedo;
+  denoiser_options.guideNormal = context.use_pass_normal;
+
+  OptixDenoiserModelKind model = OPTIX_DENOISER_MODEL_KIND_HDR;
+  if (context.use_pass_motion) {
+    model = OPTIX_DENOISER_MODEL_KIND_TEMPORAL;
+  }
+
+  const OptixResult result = optixDenoiserCreate(
+      static_cast<OptiXDevice *>(denoiser_device_)->context,
+      model,
+      &denoiser_options,
+      &optix_denoiser_);
+
+  if (result != OPTIX_SUCCESS) {
+    denoiser_device_->set_error("Failed to create OptiX denoiser");
+    return false;
+  }
+
+  /* OptiX denoiser handle was created with the requested number of input passes. */
+  use_pass_albedo_ = context.use_pass_albedo;
+  use_pass_normal_ = context.use_pass_normal;
+  use_pass_motion_ = context.use_pass_motion;
+
+  /* OptiX denoiser has been created, but it needs configuration. */
+  is_configured_ = false;
+
+  return true;
+}
+
+bool OptiXDenoiser::denoise_configure_if_needed(DenoiseContext &context)
+{
+  /* Limit maximum tile size denoiser can be invoked with. */
+  const int2 tile_size = make_int2(min(context.buffer_params.width, 4096),
+                                   min(context.buffer_params.height, 4096));
+
+  if (is_configured_ && (configured_size_.x == tile_size.x && configured_size_.y == tile_size.y)) {
+    return true;
+  }
+
+  optix_device_assert(
+      denoiser_device_,
+      optixDenoiserComputeMemoryResources(optix_denoiser_, tile_size.x, tile_size.y, &sizes_));
+
+  /* Allocate denoiser state if tile size has changed since last setup. */
+  state_.device = denoiser_device_;
+  state_.alloc_to_device(sizes_.stateSizeInBytes + sizes_.withOverlapScratchSizeInBytes);
+
+  /* Initialize denoiser state for the current tile size. */
+  const OptixResult result = optixDenoiserSetup(
+      optix_denoiser_,
+      0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
+          * on a stream that is not the default stream. */
+      tile_size.x + sizes_.overlapWindowSizeInPixels * 2,
+      tile_size.y + sizes_.overlapWindowSizeInPixels * 2,
+      state_.device_pointer,
+      sizes_.stateSizeInBytes,
+      state_.device_pointer + sizes_.stateSizeInBytes,
+      sizes_.withOverlapScratchSizeInBytes);
+  if (result != OPTIX_SUCCESS) {
+    denoiser_device_->set_error("Failed to set up OptiX denoiser");
+    return false;
+  }
+
+  cuda_device_assert(denoiser_device_, cuCtxSynchronize());
+
+  is_configured_ = true;
+  configured_size_ = tile_size;
+
+  return true;
+}
+
+bool OptiXDenoiser::denoise_run(const DenoiseContext &context, const DenoisePass &pass)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+  const int width = buffer_params.width;
+  const int height = buffer_params.height;
+
+  /* Set up input and output layer information. */
+  OptixImage2D color_layer = {0};
+  OptixImage2D albedo_layer = {0};
+  OptixImage2D normal_layer = {0};
+  OptixImage2D flow_layer = {0};
+
+  OptixImage2D output_layer = {0};
+  OptixImage2D prev_output_layer = {0};
+
+  /* Color pass. */
+  {
+    const int pass_denoised = pass.denoised_offset;
+    const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+    color_layer.data = context.render_buffers->buffer.device_pointer +
+                       pass_denoised * sizeof(float);
+    color_layer.width = width;
+    color_layer.height = height;
+    color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+    color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+    color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+  }
+
+  /* Previous output. */
+  if (context.prev_output.offset != PASS_UNUSED) {
+    const int64_t pass_stride_in_bytes = context.prev_output.pass_stride * sizeof(float);
+
+    prev_output_layer.data = context.prev_output.device_pointer +
+                             context.prev_output.offset * sizeof(float);
+    prev_output_layer.width = width;
+    prev_output_layer.height = height;
+    prev_output_layer.rowStrideInBytes = pass_stride_in_bytes * context.prev_output.stride;
+    prev_output_layer.pixelStrideInBytes = pass_stride_in_bytes;
+    prev_output_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+  }
+
+  /* Optional albedo and color passes. */
+  if (context.num_input_passes > 1) {
+    const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+    const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+    const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+    if (context.use_pass_albedo) {
+      albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+      albedo_layer.width = width;
+      albedo_layer.height = height;
+      albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+      albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+    }
+
+    if (context.use_pass_normal) {
+      normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+      normal_layer.width = width;
+      normal_layer.height = height;
+      normal_layer.rowStrideInBytes = row_stride_in_bytes;
+      normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+    }
+
+    if (context.use_pass_motion) {
+      flow_layer.data = d_guiding_buffer + context.guiding_params.pass_flow * sizeof(float);
+      flow_layer.width = width;
+      flow_layer.height = height;
+      flow_layer.rowStrideInBytes = row_stride_in_bytes;
+      flow_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      flow_layer.format = OPTIX_PIXEL_FORMAT_FLOAT2;
+    }
+  }
+
+  /* Denoise in-place of the noisy input in the render buffers. */
+  output_layer = color_layer;
+
+  OptixDenoiserGuideLayer guide_layers = {};
+  guide_layers.albedo = albedo_layer;
+  guide_layers.normal = normal_layer;
+  guide_layers.flow = flow_layer;
+
+  OptixDenoiserLayer image_layers = {};
+  image_layers.input = color_layer;
+  image_layers.previousOutput = prev_output_layer;
+  image_layers.output = output_layer;
+
+  /* Finally run denoising. */
+  OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+
+  optix_device_assert(denoiser_device_,
+                      ccl::optixUtilDenoiserInvokeTiled(
+                          optix_denoiser_,
+                          static_cast<OptiXDeviceQueue *>(denoiser_queue_.get())->stream(),
+                          &params,
+                          state_.device_pointer,
+                          sizes_.stateSizeInBytes,
+                          &guide_layers,
+                          &image_layers,
+                          1,
+                          state_.device_pointer + sizes_.stateSizeInBytes,
+                          sizes_.withOverlapScratchSizeInBytes,
+                          sizes_.overlapWindowSizeInPixels,
+                          configured_size_.x,
+                          configured_size_.y));
+
+  return true;
+}
+
 CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/integrator/denoiser_optix.h
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -3,16 +3,84 @@

 #pragma once

-#include "integrator/denoiser_device.h"
+#ifdef WITH_OPTIX
+
+#  include "integrator/denoiser_gpu.h"
+
+#  include "device/optix/util.h"

 CCL_NAMESPACE_BEGIN

-class OptiXDenoiser : public DeviceDenoiser {
+/* Implementation of denoising API which uses the OptiX denoiser. */
+class OptiXDenoiser : public DenoiserGPU {
 public:
  OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+  ~OptiXDenoiser();

 protected:
  virtual uint get_device_type_mask() const override;
+
+ private:
+  class DenoiseContext;
+  class DenoisePass;
+
+  virtual bool denoise_buffer(const DenoiseTask &task) override;
+
+  /* Read guiding passes from the render buffers, preprocess them in a way which is expected by
+   * OptiX and store in the guiding passes memory within the given context.
+   *
+   * Pre-processing of the guiding passes is to only happen once per context lifetime. DO not
+   * preprocess them for every pass which is being denoised. */
+  bool denoise_filter_guiding_preprocess(const DenoiseContext &context);
+
+  /* Set fake albedo pixels in the albedo guiding pass storage.
+   * After this point only passes which do not need albedo for denoising can be processed. */
+  bool denoise_filter_guiding_set_fake_albedo(const DenoiseContext &context);
+
+  void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+  /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+   * input within the given context. Pixels are scaled to the number of samples, but are not
+   * preprocessed yet. */
+  void denoise_color_read(const DenoiseContext &context, const DenoisePass &pass);
+
+  /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+   * denoiser result to the render buffer. */
+  bool denoise_filter_color_preprocess(const DenoiseContext &context, const DenoisePass &pass);
+  bool denoise_filter_color_postprocess(const DenoiseContext &context, const DenoisePass &pass);
+
+  /* Make sure the OptiX denoiser is created and configured. */
+  bool denoise_ensure(DenoiseContext &context);
+
+  /* Create OptiX denoiser descriptor if needed.
+   * Will do nothing if the current OptiX descriptor is usable for the given parameters.
+   * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */
+  bool denoise_create_if_needed(DenoiseContext &context);
+
+  /* Configure existing OptiX denoiser descriptor for the use for the given task. */
+  bool denoise_configure_if_needed(DenoiseContext &context);
+
+  /* Run configured denoiser. */
+  bool denoise_run(const DenoiseContext &context, const DenoisePass &pass);
+
+  OptixDenoiser optix_denoiser_ = nullptr;
+
+  /* Configuration size, as provided to `optixDenoiserSetup`.
+   * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the
+   * `is_configured` will be false. */
+  bool is_configured_ = false;
+  int2 configured_size_ = make_int2(0, 0);
+
+  /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+   * The memory layout goes as following: [denoiser state][scratch buffer]. */
+  device_only_memory<unsigned char> state_;
+  OptixDenoiserSizes sizes_ = {};
+
+  bool use_pass_albedo_ = false;
+  bool use_pass_normal_ = false;
+  bool use_pass_motion_ = false;
 };

 CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX
  device/optix/kernel_shader_raytrace.cu
 )

+if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+  set(SRC_KERNEL_DEVICE_OPTIX
+    ${SRC_KERNEL_DEVICE_OPTIX}
+    osl/services_optix.cu
+    device/optix/kernel_osl.cu
+  )
+endif()
+
 set(SRC_KERNEL_DEVICE_ONEAPI
  device/oneapi/kernel.cpp
 )
@@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS
  svm/vertex_color.h
 )

+if(WITH_CYCLES_OSL)
+  set(SRC_KERNEL_OSL_HEADERS
+    osl/osl.h
+    osl/closures_setup.h
+    osl/closures_template.h
+    osl/services_gpu.h
+    osl/types.h
+  )
+endif()
+
 set(SRC_KERNEL_GEOM_HEADERS
  geom/geom.h
  geom/attribute.h
@@ -267,10 +285,17 @@ set(SRC_KERNEL_INTEGRATOR_HEADERS
 )

 set(SRC_KERNEL_LIGHT_HEADERS
-  light/light.h
+  light/area.h
  light/background.h
  light/common.h
+  light/distant.h
+  light/distribution.h
+  light/light.h
+  light/point.h
  light/sample.h
+  light/spot.h
+  light/tree.h
+  light/triangle.h
 )

 set(SRC_KERNEL_SAMPLE_HEADERS
@@ -306,6 +331,7 @@ set(SRC_KERNEL_HEADERS
  ${SRC_KERNEL_GEOM_HEADERS}
  ${SRC_KERNEL_INTEGRATOR_HEADERS}
  ${SRC_KERNEL_LIGHT_HEADERS}
+  ${SRC_KERNEL_OSL_HEADERS}
  ${SRC_KERNEL_SAMPLE_HEADERS}
  ${SRC_KERNEL_SVM_HEADERS}
  ${SRC_KERNEL_TYPES_HEADERS}
@@ -328,6 +354,7 @@ set(SRC_UTIL_HEADERS
  ../util/math_int2.h
  ../util/math_int3.h
  ../util/math_int4.h
+  ../util/math_int8.h
  ../util/math_matrix.h
  ../util/projection.h
  ../util/rect.h
@@ -350,6 +377,8 @@ set(SRC_UTIL_HEADERS
  ../util/types_int3_impl.h
  ../util/types_int4.h
  ../util/types_int4_impl.h
+  ../util/types_int8.h
+  ../util/types_int8_impl.h
  ../util/types_spectrum.h
  ../util/types_uchar2.h
  ../util/types_uchar2_impl.h
@@ -444,6 +473,7 @@ if(WITH_CYCLES_CUDA_BINARIES)

    if(WITH_CYCLES_DEBUG)
      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} --ptxas-options="-v")
    endif()

    set(_cuda_nvcc_args
@@ -451,7 +481,6 @@ if(WITH_CYCLES_CUDA_BINARIES)
          ${CUDA_NVCC_FLAGS}
          --${format}
          ${CMAKE_CURRENT_SOURCE_DIR}${cuda_kernel_src}
-          --ptxas-options="-v"
          ${cuda_flags})

    if(WITH_COMPILER_CCACHE AND CCACHE_PROGRAM)
@@ -660,6 +689,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
    kernel_optix_shader_raytrace
    "device/optix/kernel_shader_raytrace.cu"
    "--keep-device-functions")
+  if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+    CYCLES_OPTIX_KERNEL_ADD(
+      kernel_optix_osl
+      "device/optix/kernel_osl.cu"
+      "--relocatable-device-code=true")
+    CYCLES_OPTIX_KERNEL_ADD(
+      kernel_optix_osl_services
+      "osl/services_optix.cu"
+      "--relocatable-device-code=true")
+  endif()

  add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
  cycles_set_solution_folder(cycles_kernel_optix)
@@ -947,6 +986,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS})
 source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS})
+source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS})
 source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS})
 source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS})
 source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS})
@@ -983,6 +1023,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg,
                                          ccl_private float2 *roughness,
                                          ccl_private float *eta)
 {
+#ifdef __SVM__
  bool refractive = false;
  float alpha = 1.0f;
+#endif
  switch (sc->type) {
    case CLOSURE_BSDF_DIFFUSE_ID:
      *roughness = one_float2();
--- a/intern/cycles/kernel/closure/bsdf_diffuse.h
+++ b/intern/cycles/kernel/closure/bsdf_diffuse.h
@@ -69,7 +69,7 @@ ccl_device int bsdf_diffuse_sample(ccl_private const ShaderClosure *sc,
 ccl_device int bsdf_translucent_setup(ccl_private DiffuseBsdf *bsdf)
 {
  bsdf->type = CLOSURE_BSDF_TRANSLUCENT_ID;
-  return SD_BSDF | SD_BSDF_HAS_EVAL;
+  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_HAS_TRANSMISSION;
 }

 ccl_device Spectrum bsdf_translucent_eval(ccl_private const ShaderClosure *sc,
--- a/intern/cycles/kernel/closure/bsdf_hair.h
+++ b/intern/cycles/kernel/closure/bsdf_hair.h
@@ -34,7 +34,7 @@ ccl_device int bsdf_hair_transmission_setup(ccl_private HairBsdf *bsdf)
  bsdf->type = CLOSURE_BSDF_HAIR_TRANSMISSION_ID;
  bsdf->roughness1 = clamp(bsdf->roughness1, 0.001f, 1.0f);
  bsdf->roughness2 = clamp(bsdf->roughness2, 0.001f, 1.0f);
-  return SD_BSDF | SD_BSDF_HAS_EVAL;
+  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_HAS_TRANSMISSION;
 }

 ccl_device Spectrum bsdf_hair_reflection_eval(ccl_private const ShaderClosure *sc,
--- a/intern/cycles/kernel/closure/bsdf_hair_principled.h
+++ b/intern/cycles/kernel/closure/bsdf_hair_principled.h
@@ -196,7 +196,7 @@ ccl_device int bsdf_principled_hair_setup(ccl_private ShaderData *sd,

  bsdf->extra->geom = make_float4(Y.x, Y.y, Y.z, h);

-  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
+  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG | SD_BSDF_HAS_TRANSMISSION;
 }

 #endif /* __HAIR__ */
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -346,7 +346,7 @@ ccl_device int bsdf_microfacet_ggx_refraction_setup(ccl_private MicrofacetBsdf *

  bsdf->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;

-  return SD_BSDF | SD_BSDF_HAS_EVAL;
+  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_HAS_TRANSMISSION;
 }

 ccl_device void bsdf_microfacet_ggx_blur(ccl_private ShaderClosure *sc, float roughness)
@@ -776,7 +776,7 @@ ccl_device int bsdf_microfacet_beckmann_refraction_setup(ccl_private MicrofacetB
  bsdf->alpha_y = bsdf->alpha_x;

  bsdf->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
-  return SD_BSDF | SD_BSDF_HAS_EVAL;
+  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_HAS_TRANSMISSION;
 }

 ccl_device void bsdf_microfacet_beckmann_blur(ccl_private ShaderClosure *sc, float roughness)
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -559,7 +559,7 @@ ccl_device int bsdf_microfacet_multi_ggx_glass_setup(ccl_private MicrofacetBsdf

  bsdf->type = CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID;

-  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG;
+  return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_NEEDS_LCG | SD_BSDF_HAS_TRANSMISSION;
 }

 ccl_device int bsdf_microfacet_multi_ggx_glass_fresnel_setup(ccl_private MicrofacetBsdf *bsdf,
--- a/intern/cycles/kernel/data_arrays.h
+++ b/intern/cycles/kernel/data_arrays.h
@@ -60,6 +60,13 @@ KERNEL_DATA_ARRAY(KernelLight, lights)
 KERNEL_DATA_ARRAY(float2, light_background_marginal_cdf)
 KERNEL_DATA_ARRAY(float2, light_background_conditional_cdf)

+/* light tree */
+KERNEL_DATA_ARRAY(KernelLightTreeNode, light_tree_nodes)
+KERNEL_DATA_ARRAY(KernelLightTreeEmitter, light_tree_emitters)
+KERNEL_DATA_ARRAY(uint, light_to_tree)
+KERNEL_DATA_ARRAY(uint, object_lookup_offset)
+KERNEL_DATA_ARRAY(uint, triangle_to_tree)
+
 /* particles */
 KERNEL_DATA_ARRAY(KernelParticle, particles)

--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -23,24 +23,19 @@ KERNEL_STRUCT_MEMBER(background, int, volume_shader)
 KERNEL_STRUCT_MEMBER(background, float, volume_step_size)
 KERNEL_STRUCT_MEMBER(background, int, transparent)
 KERNEL_STRUCT_MEMBER(background, float, transparent_roughness_squared_threshold)
-/* Portal sampling. */
-KERNEL_STRUCT_MEMBER(background, float, portal_weight)
-KERNEL_STRUCT_MEMBER(background, int, num_portals)
-KERNEL_STRUCT_MEMBER(background, int, portal_offset)
 /* Sun sampling. */
 KERNEL_STRUCT_MEMBER(background, float, sun_weight)
 /* Importance map sampling. */
 KERNEL_STRUCT_MEMBER(background, float, map_weight)
+KERNEL_STRUCT_MEMBER(background, float, portal_weight)
 KERNEL_STRUCT_MEMBER(background, int, map_res_x)
 KERNEL_STRUCT_MEMBER(background, int, map_res_y)
 /* Multiple importance sampling. */
 KERNEL_STRUCT_MEMBER(background, int, use_mis)
 /* Lightgroup. */
 KERNEL_STRUCT_MEMBER(background, int, lightgroup)
-/* Padding. */
-KERNEL_STRUCT_MEMBER(background, int, pad1)
-KERNEL_STRUCT_MEMBER(background, int, pad2)
-KERNEL_STRUCT_MEMBER(background, int, pad3)
+/* Light Index. */
+KERNEL_STRUCT_MEMBER(background, int, light_index)
 KERNEL_STRUCT_END(KernelBackground)

 /* BVH: own BVH2 if no native device acceleration struct used. */
@@ -102,8 +97,6 @@ KERNEL_STRUCT_MEMBER(film, int, pass_emission)
 KERNEL_STRUCT_MEMBER(film, int, pass_background)
 KERNEL_STRUCT_MEMBER(film, int, pass_ao)
 KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold)
-KERNEL_STRUCT_MEMBER(film, int, pass_shadow)
-KERNEL_STRUCT_MEMBER(film, float, pass_shadow_scale)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count)
 KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte)
@@ -137,9 +130,6 @@ KERNEL_STRUCT_MEMBER(film, int, use_approximate_shadow_catcher)
 KERNEL_STRUCT_MEMBER(film, int, pass_guiding_color)
 KERNEL_STRUCT_MEMBER(film, int, pass_guiding_probability)
 KERNEL_STRUCT_MEMBER(film, int, pass_guiding_avg_roughness)
-/* Padding. */
-KERNEL_STRUCT_MEMBER(film, int, pad1)
-KERNEL_STRUCT_MEMBER(film, int, pad2)
 KERNEL_STRUCT_END(KernelFilm)

 /* Integrator. */
@@ -147,10 +137,18 @@ KERNEL_STRUCT_END(KernelFilm)
 KERNEL_STRUCT_BEGIN(KernelIntegrator, integrator)
 /* Emission. */
 KERNEL_STRUCT_MEMBER(integrator, int, use_direct_light)
+KERNEL_STRUCT_MEMBER(integrator, int, use_light_mis)
+KERNEL_STRUCT_MEMBER(integrator, int, use_light_tree)
+KERNEL_STRUCT_MEMBER(integrator, int, num_lights)
+KERNEL_STRUCT_MEMBER(integrator, int, num_distant_lights)
+KERNEL_STRUCT_MEMBER(integrator, int, num_background_lights)
+/* Portal sampling. */
+KERNEL_STRUCT_MEMBER(integrator, int, num_portals)
+KERNEL_STRUCT_MEMBER(integrator, int, portal_offset)
+/* Flat light distribution. */
 KERNEL_STRUCT_MEMBER(integrator, int, num_distribution)
-KERNEL_STRUCT_MEMBER(integrator, int, num_all_lights)
-KERNEL_STRUCT_MEMBER(integrator, float, pdf_triangles)
-KERNEL_STRUCT_MEMBER(integrator, float, pdf_lights)
+KERNEL_STRUCT_MEMBER(integrator, float, distribution_pdf_triangles)
+KERNEL_STRUCT_MEMBER(integrator, float, distribution_pdf_lights)
 KERNEL_STRUCT_MEMBER(integrator, float, light_inv_rr_threshold)
 /* Bounces. */
 KERNEL_STRUCT_MEMBER(integrator, int, min_bounce)
@@ -177,8 +175,6 @@ KERNEL_STRUCT_MEMBER(integrator, int, seed)
 /* Clamp. */
 KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_direct)
 KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_indirect)
-/* MIS. */
-KERNEL_STRUCT_MEMBER(integrator, int, use_lamp_mis)
 /* Caustics. */
 KERNEL_STRUCT_MEMBER(integrator, int, use_caustics)
 /* Sampling pattern. */
@@ -195,7 +191,6 @@ KERNEL_STRUCT_MEMBER(integrator, int, has_shadow_catcher)
 KERNEL_STRUCT_MEMBER(integrator, int, filter_closures)
 /* MIS debugging. */
 KERNEL_STRUCT_MEMBER(integrator, int, direct_light_sampling_type)
-
 /* Path Guiding */
 KERNEL_STRUCT_MEMBER(integrator, float, surface_guiding_probability)
 KERNEL_STRUCT_MEMBER(integrator, float, volume_guiding_probability)
@@ -210,7 +205,6 @@ KERNEL_STRUCT_MEMBER(integrator, int, use_guiding_mis_weights)
 /* Padding. */
 KERNEL_STRUCT_MEMBER(integrator, int, pad1)
 KERNEL_STRUCT_MEMBER(integrator, int, pad2)
-KERNEL_STRUCT_MEMBER(integrator, int, pad3)
 KERNEL_STRUCT_END(KernelIntegrator)

 /* SVM. For shader specialization. */
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -7,6 +7,7 @@
 * one with SSE2 intrinsics.
 */
 #if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #endif

@@ -29,11 +30,15 @@
 #    define __KERNEL_SSE41__
 #  endif
 #  ifdef __AVX__
-#    define __KERNEL_SSE__
+#    ifndef __KERNEL_SSE__
+#      define __KERNEL_SSE__
+#    endif
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
-#    define __KERNEL_SSE__
+#    ifndef __KERNEL_SSE__
+#      define __KERNEL_SSE__
+#    endif
 #    define __KERNEL_AVX2__
 #  endif
 #endif
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -30,6 +30,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */

 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #if __CUDA_ARCH__ < 500
 #  define ccl_device_inline __device__ __forceinline__
 #  define ccl_device_forceinline __device__ __forceinline__
@@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D

 typedef unsigned short half;

-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
  half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
  return val;
 }

-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -314,11 +314,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

@@ -333,11 +329,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

@@ -349,11 +341,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

@@ -366,11 +354,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

@@ -383,11 +368,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

@@ -431,11 +413,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

@@ -469,11 +447,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix

--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -56,7 +56,7 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 #else /* !__KERNEL__ONEAPI__ */
 #  ifndef __KERNEL_METAL__
-template<uint blocksize, typename IsActiveOp>
+template<typename IsActiveOp>
 __device__
 #  endif
    void
@@ -79,6 +79,10 @@ __device__
 {
  extern ccl_gpu_shared int warp_offset[];

+#    ifndef __KERNEL_METAL__
+  const uint blocksize = ccl_gpu_block_dim_x;
+#    endif
+
  const uint thread_index = ccl_gpu_thread_idx_x;
  const uint thread_warp = thread_index % ccl_gpu_warp_size;

@@ -149,7 +153,7 @@ __device__

 #ifdef __KERNEL_METAL__

-#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                               is_active_op(ccl_gpu_global_id_x()) : \
                               0; \
@@ -167,15 +171,13 @@ __device__
                                         simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)

-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)

 #else

-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
-    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)

 #endif

--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -28,6 +28,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */

 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline __device__ __inline__
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -38,6 +38,7 @@ using namespace metal::raytracing;
 #  define ccl_device_noinline ccl_device __attribute__((noinline))
 #endif

+#define ccl_device_extern extern "C"
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -28,6 +28,7 @@
 /* Qualifier wrappers for different names on different devices */

 #define ccl_device
+#define ccl_device_extern extern "C"
 #define ccl_global
 #define ccl_always_inline __attribute__((always_inline))
 #define ccl_device_inline inline
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -33,14 +33,16 @@ typedef unsigned long long uint64_t;
 #endif

 #define ccl_device \
-  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+  static __device__ \
+      __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
-#define ccl_device_inline_method ccl_device
-#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_inline_method __device__ __forceinline__
+#define ccl_device_noinline static __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_inline_constant __constant__
+#define ccl_inline_constant static __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -57,23 +59,6 @@ typedef unsigned long long uint64_t;

 #define kernel_assert(cond)

-/* GPU thread, block, grid size and index */
-
-#define ccl_gpu_thread_idx_x (threadIdx.x)
-#define ccl_gpu_block_dim_x (blockDim.x)
-#define ccl_gpu_block_idx_x (blockIdx.x)
-#define ccl_gpu_grid_dim_x (gridDim.x)
-#define ccl_gpu_warp_size (warpSize)
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
-#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-
-/* GPU warp synchronization. */
-
-#define ccl_gpu_syncthreads() __syncthreads()
-#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
-
 /* GPU texture objects */

 typedef unsigned long long CUtexObject;
@@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D

 typedef unsigned short half;

-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
  half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
  return val;
 }

-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -25,6 +25,7 @@ struct KernelParamsOptiX {
  /* Kernel arguments */
  const int *path_index_array;
  float *render_buffer;
+  int offset;

  /* Global scene data and textures */
  KernelData data;
@@ -36,7 +37,11 @@ struct KernelParamsOptiX {
 };

 #ifdef __NVCC__
-extern "C" static __constant__ KernelParamsOptiX kernel_params;
+extern "C"
+#  ifndef __CUDACC_RDC__
+    static
+#  endif
+    __constant__ KernelParamsOptiX kernel_params;
 #endif

 /* Abstraction macros */
--- a/intern/cycles/kernel/device/optix/kernel_osl.cu
+++ b/intern/cycles/kernel/device/optix/kernel_osl.cu
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+/* Copy of the regular OptiX kernels with additional OSL support. */
+
+#include "kernel/device/optix/kernel_shader_raytrace.cu"
+
+#include "kernel/bake/bake.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_volume.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_background(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_light(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_displace_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_background()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_background_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index);
+}
--- a/intern/cycles/kernel/film/adaptive_sampling.h
+++ b/intern/cycles/kernel/film/adaptive_sampling.h
@@ -58,13 +58,29 @@ ccl_device bool film_adaptive_sampling_convergence_check(KernelGlobals kg,
  const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);

  const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
-  const float inv_sample = 1.0f / sample;
+  const float intensity_scale = kernel_data.film.exposure / sample;

  /* The per pixel error as seen in section 2.1 of
   * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
  const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
-                                 inv_sample;
-  const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+                                 intensity_scale;
+  const float intensity = (I.x + I.y + I.z) * intensity_scale;
+
+  /* Anything with R+G+B > 1 is highly exposed - even in sRGB it's a range that
+   * some displays aren't even able to display without significant losses in
+   * detalization. Everything with R+G+B > 3 is overexposed and should receive
+   * even less samples. Filmic-like curves need maximum sampling rate at
+   * intensity near 0.1-0.2, so threshold of 1 for R+G+B leaves an additional
+   * fstop in case it is needed for compositing.
+   */
+  float error_normalize;
+  if (intensity < 1.0f) {
+    error_normalize = sqrtf(intensity);
+  }
+  else {
+    error_normalize = intensity;
+  }
+
  /* A small epsilon is added to the divisor to prevent division by zero. */
  const float error = error_difference / (0.0001f + error_normalize);
  const bool did_converge = (error < threshold);
--- a/intern/cycles/kernel/film/light_passes.h
+++ b/intern/cycles/kernel/film/light_passes.h
@@ -527,17 +527,6 @@ ccl_device_inline void film_write_direct_light(KernelGlobals kg,
        film_write_pass_spectrum(buffer + pass_offset, contribution);
      }
    }
-
-    /* Write shadow pass. */
-    if (kernel_data.film.pass_shadow != PASS_UNUSED && (path_flag & PATH_RAY_SHADOW_FOR_LIGHT) &&
-        (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
-      const Spectrum unshadowed_throughput = INTEGRATOR_STATE(
-          state, shadow_path, unshadowed_throughput);
-      const Spectrum shadowed_throughput = INTEGRATOR_STATE(state, shadow_path, throughput);
-      const Spectrum shadow = safe_divide(shadowed_throughput, unshadowed_throughput) *
-                              kernel_data.film.pass_shadow_scale;
-      film_write_pass_spectrum(buffer + kernel_data.film.pass_shadow, shadow);
-    }
  }
 #endif
 }
--- a/intern/cycles/kernel/integrator/displacement_shader.h
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg,

  /* this will modify sd->P */
 #ifdef __OSL__
-  if (kg->osl) {
-    OSLShader::eval_displacement(kg, state, sd);
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0);
  }
  else
 #endif
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -11,10 +11,10 @@
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/shadow_catcher.h"

-#include "kernel/light/light.h"
-
 #include "kernel/geom/geom.h"

+#include "kernel/light/light.h"
+
 #include "kernel/bvh/bvh.h"

 CCL_NAMESPACE_BEGIN
@@ -387,7 +387,7 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
 #endif /* __MNEE__ */

  /* Light intersection for MIS. */
-  if (kernel_data.integrator.use_lamp_mis) {
+  if (kernel_data.integrator.use_light_mis) {
    /* NOTE: if we make lights visible to camera rays, we'll need to initialize
     * these in the path_state_init. */
    const int last_type = INTEGRATOR_STATE(state, isect, type);
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -108,48 +108,6 @@ ccl_device_inline float mat22_inverse(const float4 m, ccl_private float4 &m_inve
  return det;
 }

-/* Update light sample */
-ccl_device_forceinline void mnee_update_light_sample(KernelGlobals kg,
-                                                     const float3 P,
-                                                     ccl_private LightSample *ls)
-{
-  /* correct light sample position/direction and pdf
-   * NOTE: preserve pdf in area measure */
-  const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ls->lamp);
-
-  if (ls->type == LIGHT_POINT || ls->type == LIGHT_SPOT) {
-    ls->D = normalize_len(ls->P - P, &ls->t);
-    ls->Ng = -ls->D;
-
-    float2 uv = map_to_sphere(ls->Ng);
-    ls->u = uv.x;
-    ls->v = uv.y;
-
-    float invarea = klight->spot.invarea;
-    ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
-    ls->pdf = invarea;
-
-    if (ls->type == LIGHT_SPOT) {
-      /* spot light attenuation */
-      float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
-      ls->eval_fac *= spot_light_attenuation(
-          dir, klight->spot.spot_angle, klight->spot.spot_smooth, ls->Ng);
-    }
-  }
-  else if (ls->type == LIGHT_AREA) {
-    float invarea = fabsf(klight->area.invarea);
-    ls->D = normalize_len(ls->P - P, &ls->t);
-    ls->pdf = invarea;
-    if (klight->area.tan_spread > 0.f) {
-      ls->eval_fac = 0.25f * invarea;
-      ls->eval_fac *= light_spread_attenuation(
-          ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
-    }
-  }
-
-  ls->pdf *= kernel_data.integrator.pdf_lights;
-}
-
 /* Manifold vertex setup from ray and intersection data */
 ccl_device_forceinline void mnee_setup_manifold_vertex(KernelGlobals kg,
                                                       ccl_private ManifoldVertex *vtx,
@@ -819,7 +777,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,

  /* Update light sample with new position / direct.ion
   * and keep pdf in vertex area measure */
-  mnee_update_light_sample(kg, vertices[vertex_count - 1].p, ls);
+  light_sample_update_position(kg, ls, vertices[vertex_count - 1].p);

  /* Save state path bounce info in case a light path node is used in the refractive interface or
   * light shader graph. */
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -91,7 +91,10 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
 #endif
 }

-ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state, int label)
+ccl_device_inline void path_state_next(KernelGlobals kg,
+                                       IntegratorState state,
+                                       const int label,
+                                       const int shader_flag)
 {
  uint32_t flag = INTEGRATOR_STATE(state, path, flag);

@@ -120,12 +123,12 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,
    flag |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
  }

-  flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP);
+  flag &= ~(PATH_RAY_ALL_VISIBILITY | PATH_RAY_MIS_SKIP | PATH_RAY_MIS_HAD_TRANSMISSION);

 #ifdef __VOLUME__
  if (label & LABEL_VOLUME_SCATTER) {
    /* volume scatter */
-    flag |= PATH_RAY_VOLUME_SCATTER;
+    flag |= PATH_RAY_VOLUME_SCATTER | PATH_RAY_MIS_HAD_TRANSMISSION;
    flag &= ~PATH_RAY_TRANSPARENT_BACKGROUND;
    if (!(flag & PATH_RAY_ANY_PASS)) {
      flag |= PATH_RAY_VOLUME_PASS;
@@ -188,6 +191,11 @@ ccl_device_inline void path_state_next(KernelGlobals kg, IntegratorState state,
      flag |= PATH_RAY_GLOSSY | PATH_RAY_SINGULAR | PATH_RAY_MIS_SKIP;
    }

+    /* Flag for consistent MIS weights with light tree. */
+    if (shader_flag & SD_BSDF_HAS_TRANSMISSION) {
+      flag |= PATH_RAY_MIS_HAD_TRANSMISSION;
+    }
+
    /* Render pass categories. */
    if (!(flag & PATH_RAY_ANY_PASS) && !(flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
      flag |= PATH_RAY_SURFACE_PASS;
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -69,9 +69,9 @@ ccl_device_inline void integrate_background(KernelGlobals kg,
  bool eval_background = true;
  float transparent = 0.0f;

+  int path_flag = INTEGRATOR_STATE(state, path, flag);
  const bool is_transparent_background_ray = kernel_data.background.transparent &&
-                                             (INTEGRATOR_STATE(state, path, flag) &
-                                              PATH_RAY_TRANSPARENT_BACKGROUND);
+                                             (path_flag & PATH_RAY_TRANSPARENT_BACKGROUND);

  if (is_transparent_background_ray) {
    transparent = average(INTEGRATOR_STATE(state, path, throughput));
@@ -86,7 +86,7 @@ ccl_device_inline void integrate_background(KernelGlobals kg,
 #ifdef __MNEE__
  if (INTEGRATOR_STATE(state, path, mnee) & PATH_MNEE_CULL_LIGHT_CONNECTION) {
    if (kernel_data.background.use_mis) {
-      for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+      for (int lamp = 0; lamp < kernel_data.integrator.num_lights; lamp++) {
        /* This path should have been resolved with mnee, it will
         * generate a firefly for small lights since it is improbable. */
        const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);
@@ -113,17 +113,10 @@ ccl_device_inline void integrate_background(KernelGlobals kg,

    /* Background MIS weights. */
    float mis_weight = 1.0f;
-    /* Check if background light exists or if we should skip pdf. */
+    /* Check if background light exists or if we should skip PDF. */
    if (!(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_MIS_SKIP) &&
        kernel_data.background.use_mis) {
-      const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
-      const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
-      const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-
-      /* multiple importance sampling, get background light pdf for ray
-       * direction, and compute weight with respect to BSDF pdf */
-      const float pdf = background_light_pdf(kg, ray_P, ray_D);
-      mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
+      mis_weight = light_sample_mis_weight_forward_background(kg, state, path_flag);
    }

    guiding_record_background(kg, state, L, mis_weight);
@@ -142,8 +135,8 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
  const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
  const float ray_time = INTEGRATOR_STATE(state, ray, time);
  LightSample ls ccl_optional_struct_init;
-  for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
-    if (light_sample_from_distant_ray(kg, ray_D, lamp, &ls)) {
+  for (int lamp = 0; lamp < kernel_data.integrator.num_lights; lamp++) {
+    if (distant_light_sample_from_intersection(kg, ray_D, lamp, &ls)) {
      /* Use visibility flag to skip lights. */
 #ifdef __PASSES__
      const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
@@ -182,10 +175,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
      /* MIS weighting. */
      float mis_weight = 1.0f;
      if (!(path_flag & PATH_RAY_MIS_SKIP)) {
-        /* multiple importance sampling, get regular light pdf,
-         * and compute weight with respect to BSDF pdf */
-        const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-        mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
+        mis_weight = light_sample_mis_weight_forward_distant(kg, state, path_flag, &ls);
      }

      /* Write to render buffer. */
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -61,10 +61,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
  /* MIS weighting. */
  float mis_weight = 1.0f;
  if (!(path_flag & PATH_RAY_MIS_SKIP)) {
-    /* multiple importance sampling, get regular light pdf,
-     * and compute weight with respect to BSDF pdf */
-    const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
+    mis_weight = light_sample_mis_weight_forward_lamp(kg, state, path_flag, &ls, ray_P);
  }

  /* Write to render buffer. */
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -15,7 +15,6 @@
 #include "kernel/integrator/surface_shader.h"
 #include "kernel/integrator/volume_stack.h"

-#include "kernel/light/light.h"
 #include "kernel/light/sample.h"

 CCL_NAMESPACE_BEGIN
@@ -113,20 +112,16 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
  Spectrum L = surface_shader_emission(sd);
  float mis_weight = 1.0f;

+  const bool has_mis = !(path_flag & PATH_RAY_MIS_SKIP) &&
+                       (sd->flag & ((sd->flag & SD_BACKFACING) ? SD_MIS_BACK : SD_MIS_FRONT));
+
 #ifdef __HAIR__
-  if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) &&
-      (sd->type & PRIMITIVE_TRIANGLE))
+  if (has_mis && (sd->type & PRIMITIVE_TRIANGLE))
 #else
-  if (!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+  if (has_mis)
 #endif
  {
-    const float bsdf_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float t = sd->ray_length;
-
-    /* Multiple importance sampling, get triangle light pdf,
-     * and compute weight with respect to BSDF pdf. */
-    float pdf = triangle_light_pdf(kg, sd, t);
-    mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
+    mis_weight = light_sample_mis_weight_forward_surface(kg, state, path_flag, sd);
  }

  guiding_record_surface_emission(kg, state, L, mis_weight);
@@ -154,8 +149,17 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
    const uint bounce = INTEGRATOR_STATE(state, path, bounce);
    const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);

-    if (!light_distribution_sample_from_position(
-            kg, rand_light.x, rand_light.y, sd->time, sd->P, bounce, path_flag, &ls)) {
+    if (!light_sample_from_position(kg,
+                                    rng_state,
+                                    rand_light.x,
+                                    rand_light.y,
+                                    sd->time,
+                                    sd->P,
+                                    sd->N,
+                                    sd->flag,
+                                    bounce,
+                                    path_flag,
+                                    &ls)) {
      return;
    }
  }
@@ -322,10 +326,6 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,

  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput;

-  if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = throughput;
-  }
-
  /* Write Lightgroup, +1 as lightgroup is int but we need to encode into a uint8_t. */
  INTEGRATOR_STATE_WRITE(
      shadow_state, shadow_path, lightgroup) = (ls.type != LIGHT_BACKGROUND) ?
@@ -441,11 +441,12 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
  /* Update path state */
  if (!(label & LABEL_TRANSPARENT)) {
    INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = bsdf_pdf;
+    INTEGRATOR_STATE_WRITE(state, path, mis_origin_n) = sd->N;
    INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
        unguided_bsdf_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
  }

-  path_state_next(kg, state, label);
+  path_state_next(kg, state, label, sd->flag);

  guiding_record_surface_bounce(kg,
                                state,
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -685,14 +685,14 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
 #  endif /* __DENOISING_FEATURES__ */
 }

-/* Path tracing: sample point on light and evaluate light shader, then
- * queue shadow ray to be traced. */
-ccl_device_forceinline bool integrate_volume_sample_light(
+/* Path tracing: sample point on light for equiangular sampling. */
+ccl_device_forceinline bool integrate_volume_equiangular_sample_light(
    KernelGlobals kg,
    IntegratorState state,
+    ccl_private const Ray *ccl_restrict ray,
    ccl_private const ShaderData *ccl_restrict sd,
    ccl_private const RNGState *ccl_restrict rng_state,
-    ccl_private LightSample *ccl_restrict ls)
+    ccl_private float3 *ccl_restrict P)
 {
  /* Test if there is a light or BSDF that needs direct light. */
  if (!kernel_data.integrator.use_direct_light) {
@@ -704,15 +704,30 @@ ccl_device_forceinline bool integrate_volume_sample_light(
  const uint bounce = INTEGRATOR_STATE(state, path, bounce);
  const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);

-  if (!light_distribution_sample_from_volume_segment(
-          kg, rand_light.x, rand_light.y, sd->time, sd->P, bounce, path_flag, ls)) {
+  LightSample ls ccl_optional_struct_init;
+  if (!light_sample_from_volume_segment(kg,
+                                        rand_light.x,
+                                        rand_light.y,
+                                        sd->time,
+                                        sd->P,
+                                        ray->D,
+                                        ray->tmax - ray->tmin,
+                                        bounce,
+                                        path_flag,
+                                        &ls)) {
    return false;
  }

-  if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+  if (ls.shader & SHADER_EXCLUDE_SCATTER) {
    return false;
  }

+  if (ls.t == FLT_MAX) {
+    return false;
+  }
+
+  *P = ls.P;
+
  return true;
 }

@@ -728,8 +743,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
 #  ifdef __PATH_GUIDING__
    ccl_private const Spectrum unlit_throughput,
 #  endif
-    ccl_private const Spectrum throughput,
-    ccl_private LightSample *ccl_restrict ls)
+    ccl_private const Spectrum throughput)
 {
  PROFILING_INIT(kg, PROFILING_SHADE_VOLUME_DIRECT_LIGHT);

@@ -737,23 +751,38 @@ ccl_device_forceinline void integrate_volume_direct_light(
    return;
  }

-  /* Sample position on the same light again, now from the shading
-   * point where we scattered.
+  /* Sample position on the same light again, now from the shading point where we scattered.
   *
-   * TODO: decorrelate random numbers and use light_sample_new_position to
-   * avoid resampling the CDF. */
+   * Note that this means we sample the light tree twice when equiangular sampling is used.
+   * We could consider sampling the light tree just once and use the same light position again.
+   *
+   * This would make the PDFs for MIS weights more complicated due to having to account for
+   * both distance/equiangular and direct/indirect light sampling, but could be more accurate.
+   * Additionally we could end up behind the light or outside a spot light cone, which might
+   * waste a sample. Though on the other hand it would be possible to prevent that with
+   * equiangular sampling restricted to a smaller sub-segment where the light has influence. */
+  LightSample ls ccl_optional_struct_init;
  {
    const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
    const uint bounce = INTEGRATOR_STATE(state, path, bounce);
    const float2 rand_light = path_state_rng_2D(kg, rng_state, PRNG_LIGHT);

-    if (!light_distribution_sample_from_position(
-            kg, rand_light.x, rand_light.y, sd->time, P, bounce, path_flag, ls)) {
+    if (!light_sample_from_position(kg,
+                                    rng_state,
+                                    rand_light.x,
+                                    rand_light.y,
+                                    sd->time,
+                                    P,
+                                    zero_float3(),
+                                    SD_BSDF_HAS_TRANSMISSION,
+                                    bounce,
+                                    path_flag,
+                                    &ls)) {
      return;
    }
  }

-  if (ls->shader & SHADER_EXCLUDE_SCATTER) {
+  if (ls.shader & SHADER_EXCLUDE_SCATTER) {
    return;
  }

@@ -765,32 +794,32 @@ ccl_device_forceinline void integrate_volume_direct_light(
   * non-constant light sources. */
  ShaderDataTinyStorage emission_sd_storage;
  ccl_private ShaderData *emission_sd = AS_SHADER_DATA(&emission_sd_storage);
-  const Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, ls, sd->time);
+  const Spectrum light_eval = light_sample_shader_eval(kg, state, emission_sd, &ls, sd->time);
  if (is_zero(light_eval)) {
    return;
  }

  /* Evaluate BSDF. */
  BsdfEval phase_eval ccl_optional_struct_init;
-  float phase_pdf = volume_shader_phase_eval(kg, state, sd, phases, ls->D, &phase_eval);
+  float phase_pdf = volume_shader_phase_eval(kg, state, sd, phases, ls.D, &phase_eval);

-  if (ls->shader & SHADER_USE_MIS) {
-    float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
+  if (ls.shader & SHADER_USE_MIS) {
+    float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, phase_pdf);
    bsdf_eval_mul(&phase_eval, mis_weight);
  }

-  bsdf_eval_mul(&phase_eval, light_eval / ls->pdf);
+  bsdf_eval_mul(&phase_eval, light_eval / ls.pdf);

  /* Path termination. */
  const float terminate = path_state_rng_light_termination(kg, rng_state);
-  if (light_sample_terminate(kg, ls, &phase_eval, terminate)) {
+  if (light_sample_terminate(kg, &ls, &phase_eval, terminate)) {
    return;
  }

  /* Create shadow ray. */
  Ray ray ccl_optional_struct_init;
-  light_sample_to_volume_shadow_ray(kg, sd, ls, P, &ray);
-  const bool is_light = light_sample_is_light(ls);
+  light_sample_to_volume_shadow_ray(kg, sd, &ls, P, &ray);
+  const bool is_light = light_sample_is_light(&ls);

  /* Branch off shadow kernel. */
  IntegratorShadowState shadow_state = integrator_shadow_path_init(
@@ -849,14 +878,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
      state, path, transmission_bounce);
  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, throughput) = throughput_phase;

-  if (kernel_data.kernel_features & KERNEL_FEATURE_SHADOW_PASS) {
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, unshadowed_throughput) = throughput;
-  }
-
  /* Write Lightgroup, +1 as lightgroup is int but we need to encode into a uint8_t. */
  INTEGRATOR_STATE_WRITE(
-      shadow_state, shadow_path, lightgroup) = (ls->type != LIGHT_BACKGROUND) ?
-                                                   ls->group + 1 :
+      shadow_state, shadow_path, lightgroup) = (ls.type != LIGHT_BACKGROUND) ?
+                                                   ls.group + 1 :
                                                   kernel_data.background.lightgroup + 1;

 #  ifdef __PATH_GUIDING__
@@ -958,10 +983,11 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(

  /* Update path state */
  INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = phase_pdf;
+  INTEGRATOR_STATE_WRITE(state, path, mis_origin_n) = zero_float3();
  INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
      unguided_phase_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));

-  path_state_next(kg, state, label);
+  path_state_next(kg, state, label, sd->flag);
  return true;
 }

@@ -983,12 +1009,11 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,

  /* Sample light ahead of volume stepping, for equiangular sampling. */
  /* TODO: distant lights are ignored now, but could instead use even distribution. */
-  LightSample ls ccl_optional_struct_init;
  const bool need_light_sample = !(INTEGRATOR_STATE(state, path, flag) & PATH_RAY_TERMINATE);
+  float3 equiangular_P = zero_float3();
  const bool have_equiangular_sample = need_light_sample &&
-                                       integrate_volume_sample_light(
-                                           kg, state, &sd, &rng_state, &ls) &&
-                                       (ls.t != FLT_MAX);
+                                       integrate_volume_equiangular_sample_light(
+                                           kg, state, ray, &sd, &rng_state, &equiangular_P);

  VolumeSampleMethod direct_sample_method = (have_equiangular_sample) ?
                                                volume_stack_sample_method(kg, state) :
@@ -1018,7 +1043,7 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                 render_buffer,
                                 step_size,
                                 direct_sample_method,
-                                 ls.P,
+                                 equiangular_P,
                                 result);

  /* Perform path termination. The intersect_closest will have already marked this path
@@ -1085,8 +1110,7 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
 #  ifdef __PATH_GUIDING__
                                  unlit_throughput,
 #  endif
-                                  result.direct_throughput,
-                                  &ls);
+                                  result.direct_throughput);
  }

  /* Indirect light.
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -32,7 +32,7 @@ KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, throughput, KERNEL_FEATURE_PAT
 KERNEL_STRUCT_MEMBER(shadow_path,
                     PackedSpectrum,
                     unshadowed_throughput,
-                     KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
+                     KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
 KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
 KERNEL_STRUCT_MEMBER(shadow_path, PackedSpectrum, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -41,6 +41,7 @@ KERNEL_STRUCT_MEMBER(path, uint8_t, mnee, KERNEL_FEATURE_PATH_TRACING)
 * zero and distance. Note that transparency and volume attenuation increase
 * the ray tmin but keep P unmodified so that this works. */
 KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, mis_origin_n, KERNEL_FEATURE_PATH_TRACING)
 /* Filter glossy. */
 KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
--- a/intern/cycles/kernel/integrator/surface_shader.h
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg,
  sd->num_closure_left = max_closures;

 #ifdef __OSL__
-  if (kg->osl) {
-    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
-      OSLShader::eval_background(kg, state, sd, path_flag);
-    }
-    else {
-      OSLShader::eval_surface(kg, state, sd, path_flag);
-    }
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag);
  }
  else
 #endif
--- a/intern/cycles/kernel/integrator/volume_shader.h
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -491,8 +491,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg,

    /* evaluate shader */
 #  ifdef __OSL__
-    if (kg->osl) {
-      OSLShader::eval_volume(kg, state, sd, path_flag);
+    if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+      osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag);
    }
    else
 #  endif
--- a/intern/cycles/kernel/light/area.h
+++ b/intern/cycles/kernel/light/area.h
@@ -0,0 +1,387 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/light/common.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Importance sampling.
+ *
+ * An Area-Preserving Parametrization for Spherical Rectangles.
+ * Carlos Urena et al.
+ *
+ * NOTE: light_p is modified when sample_coord is true. */
+ccl_device_inline float area_light_rect_sample(float3 P,
+                                               ccl_private float3 *light_p,
+                                               const float3 axis_u,
+                                               const float len_u,
+                                               const float3 axis_v,
+                                               const float len_v,
+                                               float randu,
+                                               float randv,
+                                               bool sample_coord)
+{
+  /* In our name system we're using P for the center, which is o in the paper. */
+  float3 corner = *light_p - axis_u * len_u * 0.5f - axis_v * len_v * 0.5f;
+  /* Compute local reference system R. */
+  float3 x = axis_u;
+  float3 y = axis_v;
+  float3 z = cross(x, y);
+  /* Compute rectangle coords in local reference system. */
+  float3 dir = corner - P;
+  float z0 = dot(dir, z);
+  /* Flip 'z' to make it point against Q. */
+  if (z0 > 0.0f) {
+    z *= -1.0f;
+    z0 *= -1.0f;
+  }
+  float x0 = dot(dir, x);
+  float y0 = dot(dir, y);
+  float x1 = x0 + len_u;
+  float y1 = y0 + len_v;
+  /* Compute internal angles (gamma_i). */
+  float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1);
+  float4 nz = make_float4(y0, x1, y1, x0) * diff;
+  nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz);
+  float g0 = safe_acosf(-nz.x * nz.y);
+  float g1 = safe_acosf(-nz.y * nz.z);
+  float g2 = safe_acosf(-nz.z * nz.w);
+  float g3 = safe_acosf(-nz.w * nz.x);
+  /* Compute predefined constants. */
+  float b0 = nz.x;
+  float b1 = nz.z;
+  float b0sq = b0 * b0;
+  float k = M_2PI_F - g2 - g3;
+  /* Compute solid angle from internal angles. */
+  float S = g0 + g1 - k;
+
+  if (sample_coord) {
+    /* Compute cu. */
+    float au = randu * S + k;
+    float fu = (cosf(au) * b0 - b1) / sinf(au);
+    float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
+    cu = clamp(cu, -1.0f, 1.0f);
+    /* Compute xu. */
+    float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
+    xu = clamp(xu, x0, x1);
+    /* Compute yv. */
+    float z0sq = z0 * z0;
+    float y0sq = y0 * y0;
+    float y1sq = y1 * y1;
+    float d = sqrtf(xu * xu + z0sq);
+    float h0 = y0 / sqrtf(d * d + y0sq);
+    float h1 = y1 / sqrtf(d * d + y1sq);
+    float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
+    float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
+
+    /* Transform (xu, yv, z0) to world coords. */
+    *light_p = P + xu * x + yv * y + z0 * z;
+  }
+
+  /* return pdf */
+  if (S != 0.0f)
+    return 1.0f / S;
+  else
+    return 0.0f;
+}
+
+/* Light spread. */
+
+ccl_device float area_light_spread_attenuation(const float3 D,
+                                               const float3 lightNg,
+                                               const float cot_half_spread,
+                                               const float normalize_spread)
+{
+  /* Model a soft-box grid, computing the ratio of light not hidden by the
+   * slats of the grid at a given angle. (see D10594). */
+  const float cos_a = -dot(D, lightNg);
+  const float sin_a = safe_sqrtf(1.0f - sqr(cos_a));
+  const float tan_a = sin_a / cos_a;
+  return max((1.0f - (cot_half_spread * tan_a)) * normalize_spread, 0.0f);
+}
+
+/* Compute subset of area light that actually has an influence on the shading point, to
+ * reduce noise with low spread. */
+ccl_device bool area_light_spread_clamp_area_light(const float3 P,
+                                                   const float3 lightNg,
+                                                   ccl_private float3 *lightP,
+                                                   const float3 axis_u,
+                                                   ccl_private float *len_u,
+                                                   const float3 axis_v,
+                                                   ccl_private float *len_v,
+                                                   const float cot_half_spread)
+{
+  /* Closest point in area light plane and distance to that plane. */
+  const float3 closest_P = P - dot(lightNg, P - *lightP) * lightNg;
+  const float t = len(closest_P - P);
+
+  /* Radius of circle on area light that actually affects the shading point. */
+  const float radius = t / cot_half_spread;
+
+  /* Local uv coordinates of closest point. */
+  const float closest_u = dot(axis_u, closest_P - *lightP);
+  const float closest_v = dot(axis_v, closest_P - *lightP);
+
+  /* Compute rectangle encompassing the circle that affects the shading point,
+   * clamped to the bounds of the area light. */
+  const float min_u = max(closest_u - radius, -*len_u * 0.5f);
+  const float max_u = min(closest_u + radius, *len_u * 0.5f);
+  const float min_v = max(closest_v - radius, -*len_v * 0.5f);
+  const float max_v = min(closest_v + radius, *len_v * 0.5f);
+
+  /* Skip if rectangle is empty. */
+  if (min_u >= max_u || min_v >= max_v) {
+    return false;
+  }
+
+  /* Compute new area light center position and axes from rectangle in local
+   * uv coordinates. */
+  const float new_center_u = 0.5f * (min_u + max_u);
+  const float new_center_v = 0.5f * (min_v + max_v);
+  *len_u = max_u - min_u;
+  *len_v = max_v - min_v;
+
+  *lightP = *lightP + new_center_u * axis_u + new_center_v * axis_v;
+
+  return true;
+}
+
+/* Common API. */
+
+template<bool in_volume_segment>
+ccl_device_inline bool area_light_sample(const ccl_global KernelLight *klight,
+                                         const float randu,
+                                         const float randv,
+                                         const float3 P,
+                                         ccl_private LightSample *ls)
+{
+  ls->P = klight->co;
+
+  const float3 axis_u = klight->area.axis_u;
+  const float3 axis_v = klight->area.axis_v;
+  const float len_u = klight->area.len_u;
+  const float len_v = klight->area.len_v;
+  float3 Ng = klight->area.dir;
+  float invarea = fabsf(klight->area.invarea);
+  bool is_round = (klight->area.invarea < 0.0f);
+
+  if (!in_volume_segment) {
+    if (dot(ls->P - P, Ng) > 0.0f) {
+      return false;
+    }
+  }
+
+  float3 inplane;
+
+  if (is_round || in_volume_segment) {
+    inplane = ellipse_sample(axis_u * len_u * 0.5f, axis_v * len_v * 0.5f, randu, randv);
+    ls->P += inplane;
+    ls->pdf = invarea;
+  }
+  else {
+    inplane = ls->P;
+
+    float sample_len_u = len_u;
+    float sample_len_v = len_v;
+
+    if (!in_volume_segment && klight->area.cot_half_spread > 0.0f) {
+      if (!area_light_spread_clamp_area_light(P,
+                                              Ng,
+                                              &ls->P,
+                                              axis_u,
+                                              &sample_len_u,
+                                              axis_v,
+                                              &sample_len_v,
+                                              klight->area.cot_half_spread)) {
+        return false;
+      }
+    }
+
+    ls->pdf = area_light_rect_sample(
+        P, &ls->P, axis_u, sample_len_u, axis_v, sample_len_v, randu, randv, true);
+    inplane = ls->P - inplane;
+  }
+
+  const float light_u = dot(inplane, axis_u) / len_u;
+  const float light_v = dot(inplane, axis_v) / len_v;
+
+  /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
+  ls->u = light_v + 0.5f;
+  ls->v = -light_u - light_v;
+
+  ls->Ng = Ng;
+  ls->D = normalize_len(ls->P - P, &ls->t);
+
+  ls->eval_fac = 0.25f * invarea;
+
+  if (klight->area.cot_half_spread > 0.0f) {
+    /* Area Light spread angle attenuation */
+    ls->eval_fac *= area_light_spread_attenuation(
+        ls->D, ls->Ng, klight->area.cot_half_spread, klight->area.normalize_spread);
+  }
+
+  if (is_round) {
+    ls->pdf *= lamp_light_pdf(Ng, -ls->D, ls->t);
+  }
+
+  return true;
+}
+
+ccl_device_forceinline void area_light_update_position(const ccl_global KernelLight *klight,
+                                                       ccl_private LightSample *ls,
+                                                       const float3 P)
+{
+  const float invarea = fabsf(klight->area.invarea);
+  ls->D = normalize_len(ls->P - P, &ls->t);
+  ls->pdf = invarea;
+
+  if (klight->area.cot_half_spread > 0.f) {
+    ls->eval_fac = 0.25f * invarea;
+    ls->eval_fac *= area_light_spread_attenuation(
+        ls->D, ls->Ng, klight->area.cot_half_spread, klight->area.normalize_spread);
+  }
+}
+
+ccl_device_inline bool area_light_intersect(const ccl_global KernelLight *klight,
+                                            const ccl_private Ray *ccl_restrict ray,
+                                            ccl_private float *t,
+                                            ccl_private float *u,
+                                            ccl_private float *v)
+{
+  /* Area light. */
+  const float invarea = fabsf(klight->area.invarea);
+  const bool is_round = (klight->area.invarea < 0.0f);
+  if (invarea == 0.0f) {
+    return false;
+  }
+
+  const float3 inv_extent_u = klight->area.axis_u / klight->area.len_u;
+  const float3 inv_extent_v = klight->area.axis_v / klight->area.len_v;
+  const float3 Ng = klight->area.dir;
+
+  /* One sided. */
+  if (dot(ray->D, Ng) >= 0.0f) {
+    return false;
+  }
+
+  const float3 light_P = klight->co;
+
+  float3 P;
+  return ray_quad_intersect(ray->P,
+                            ray->D,
+                            ray->tmin,
+                            ray->tmax,
+                            light_P,
+                            inv_extent_u,
+                            inv_extent_v,
+                            Ng,
+                            &P,
+                            t,
+                            u,
+                            v,
+                            is_round);
+}
+
+ccl_device_inline bool area_light_sample_from_intersection(
+    const ccl_global KernelLight *klight,
+    ccl_private const Intersection *ccl_restrict isect,
+    const float3 ray_P,
+    const float3 ray_D,
+    ccl_private LightSample *ccl_restrict ls)
+{
+
+  /* area light */
+  float invarea = fabsf(klight->area.invarea);
+
+  float3 Ng = klight->area.dir;
+  float3 light_P = klight->co;
+
+  ls->u = isect->u;
+  ls->v = isect->v;
+  ls->D = ray_D;
+  ls->Ng = Ng;
+
+  const bool is_round = (klight->area.invarea < 0.0f);
+  if (is_round) {
+    ls->pdf = invarea * lamp_light_pdf(Ng, -ray_D, ls->t);
+  }
+  else {
+    const float3 axis_u = klight->area.axis_u;
+    const float3 axis_v = klight->area.axis_v;
+    float sample_len_u = klight->area.len_u;
+    float sample_len_v = klight->area.len_v;
+
+    if (klight->area.cot_half_spread > 0.0f) {
+      if (!area_light_spread_clamp_area_light(ray_P,
+                                              Ng,
+                                              &light_P,
+                                              axis_u,
+                                              &sample_len_u,
+                                              axis_v,
+                                              &sample_len_v,
+                                              klight->area.cot_half_spread)) {
+        return false;
+      }
+    }
+
+    ls->pdf = area_light_rect_sample(
+        ray_P, &light_P, axis_u, sample_len_u, axis_v, sample_len_v, 0, 0, false);
+  }
+  ls->eval_fac = 0.25f * invarea;
+
+  if (klight->area.cot_half_spread > 0.0f) {
+    /* Area Light spread angle attenuation */
+    ls->eval_fac *= area_light_spread_attenuation(
+        ls->D, ls->Ng, klight->area.cot_half_spread, klight->area.normalize_spread);
+    if (ls->eval_fac == 0.0f) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template<bool in_volume_segment>
+ccl_device_forceinline bool area_light_tree_parameters(const ccl_global KernelLight *klight,
+                                                       const float3 centroid,
+                                                       const float3 P,
+                                                       const float3 N,
+                                                       const float3 bcone_axis,
+                                                       ccl_private float &cos_theta_u,
+                                                       ccl_private float2 &distance,
+                                                       ccl_private float3 &point_to_centroid)
+{
+  if (!in_volume_segment) {
+    /* TODO: a cheap substitute for minimal distance between point and primitive. Does it
+     * worth the overhead to compute the accurate minimal distance? */
+    float min_distance;
+    point_to_centroid = safe_normalize_len(centroid - P, &min_distance);
+    distance = make_float2(min_distance, min_distance);
+  }
+
+  cos_theta_u = FLT_MAX;
+
+  const float3 extentu = klight->area.axis_u * klight->area.len_u;
+  const float3 extentv = klight->area.axis_v * klight->area.len_v;
+  for (int i = 0; i < 4; i++) {
+    const float3 corner = ((i & 1) - 0.5f) * extentu + 0.5f * ((i & 2) - 1) * extentv + centroid;
+    float distance_point_to_corner;
+    const float3 point_to_corner = safe_normalize_len(corner - P, &distance_point_to_corner);
+    cos_theta_u = fminf(cos_theta_u, dot(point_to_centroid, point_to_corner));
+    if (!in_volume_segment) {
+      distance.x = fmaxf(distance.x, distance_point_to_corner);
+    }
+  }
+
+  const bool front_facing = dot(bcone_axis, point_to_centroid) < 0;
+  const bool shape_above_surface = dot(N, centroid - P) + fabsf(dot(N, extentu)) +
+                                       fabsf(dot(N, extentv)) >
+                                   0;
+  const bool in_volume = is_zero(N);
+
+  return (front_facing && shape_above_surface) || in_volume;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/background.h
+++ b/intern/cycles/kernel/light/background.h
@@ -3,6 +3,7 @@

 #pragma once

+#include "kernel/light/area.h"
 #include "kernel/light/common.h"

 CCL_NAMESPACE_BEGIN
@@ -130,11 +131,11 @@ ccl_device float background_map_pdf(KernelGlobals kg, float3 direction)
 ccl_device_inline bool background_portal_data_fetch_and_check_side(
    KernelGlobals kg, float3 P, int index, ccl_private float3 *lightpos, ccl_private float3 *dir)
 {
-  int portal = kernel_data.background.portal_offset + index;
+  int portal = kernel_data.integrator.portal_offset + index;
  const ccl_global KernelLight *klight = &kernel_data_fetch(lights, portal);

-  *lightpos = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-  *dir = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
+  *lightpos = klight->co;
+  *dir = klight->area.dir;

  /* Check whether portal is on the right side. */
  if (dot(*dir, P - *lightpos) > 1e-4f)
@@ -149,7 +150,7 @@ ccl_device_inline float background_portal_pdf(
  float portal_pdf = 0.0f;

  int num_possible = 0;
-  for (int p = 0; p < kernel_data.background.num_portals; p++) {
+  for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
    if (p == ignore_portal)
      continue;

@@ -163,12 +164,16 @@ ccl_device_inline float background_portal_pdf(
    }
    num_possible++;

-    int portal = kernel_data.background.portal_offset + p;
+    int portal = kernel_data.integrator.portal_offset + p;
    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, portal);
-    float3 axisu = make_float3(
-        klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-    float3 axisv = make_float3(
-        klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+
+    const float3 axis_u = klight->area.axis_u;
+    const float len_u = klight->area.len_u;
+    const float3 axis_v = klight->area.axis_v;
+    const float len_v = klight->area.len_v;
+    const float3 inv_extent_u = axis_u / len_u;
+    const float3 inv_extent_v = axis_v / len_v;
+
    bool is_round = (klight->area.invarea < 0.0f);

    if (!ray_quad_intersect(P,
@@ -176,8 +181,8 @@ ccl_device_inline float background_portal_pdf(
                            1e-4f,
                            FLT_MAX,
                            lightpos,
-                            axisu,
-                            axisv,
+                            inv_extent_u,
+                            inv_extent_v,
                            dir,
                            NULL,
                            NULL,
@@ -189,10 +194,11 @@ ccl_device_inline float background_portal_pdf(
    if (is_round) {
      float t;
      float3 D = normalize_len(lightpos - P, &t);
-      portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+      portal_pdf += fabsf(klight->area.invarea) * lamp_light_pdf(dir, -D, t);
    }
    else {
-      portal_pdf += rect_light_sample(P, &lightpos, axisu, axisv, 0.0f, 0.0f, false);
+      portal_pdf += area_light_rect_sample(
+          P, &lightpos, axis_u, len_u, axis_v, len_v, 0.0f, 0.0f, false);
    }
  }

@@ -207,7 +213,7 @@ ccl_device_inline float background_portal_pdf(
 ccl_device int background_num_possible_portals(KernelGlobals kg, float3 P)
 {
  int num_possible_portals = 0;
-  for (int p = 0; p < kernel_data.background.num_portals; p++) {
+  for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
    float3 lightpos, dir;
    if (background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
      num_possible_portals++;
@@ -231,7 +237,7 @@ ccl_device float3 background_portal_sample(KernelGlobals kg,
  /* TODO(sergey): Some smarter way of finding portal to sample
   * is welcome.
   */
-  for (int p = 0; p < kernel_data.background.num_portals; p++) {
+  for (int p = 0; p < kernel_data.integrator.num_portals; p++) {
    /* Search for the sampled portal. */
    float3 lightpos, dir;
    if (!background_portal_data_fetch_and_check_side(kg, P, p, &lightpos, &dir))
@@ -239,23 +245,24 @@ ccl_device float3 background_portal_sample(KernelGlobals kg,

    if (portal == 0) {
      /* p is the portal to be sampled. */
-      int portal = kernel_data.background.portal_offset + p;
+      int portal = kernel_data.integrator.portal_offset + p;
      const ccl_global KernelLight *klight = &kernel_data_fetch(lights, portal);
-      float3 axisu = make_float3(
-          klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-      float3 axisv = make_float3(
-          klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
+      const float3 axis_u = klight->area.axis_u;
+      const float3 axis_v = klight->area.axis_v;
+      const float len_u = klight->area.len_u;
+      const float len_v = klight->area.len_v;
      bool is_round = (klight->area.invarea < 0.0f);

      float3 D;
      if (is_round) {
-        lightpos += ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
+        lightpos += ellipse_sample(axis_u * len_u * 0.5f, axis_v * len_v * 0.5f, randu, randv);
        float t;
        D = normalize_len(lightpos - P, &t);
-        *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(kg, dir, -D, t);
+        *pdf = fabsf(klight->area.invarea) * lamp_light_pdf(dir, -D, t);
      }
      else {
-        *pdf = rect_light_sample(P, &lightpos, axisu, axisv, randu, randv, true);
+        *pdf = area_light_rect_sample(
+            P, &lightpos, axis_u, len_u, axis_v, len_v, randu, randv, true);
        D = normalize(lightpos - P);
      }

@@ -414,7 +421,7 @@ ccl_device float background_light_pdf(KernelGlobals kg, float3 P, float3 directi
  float pdf_fac = (portal_method_pdf + sun_method_pdf + map_method_pdf);
  if (pdf_fac == 0.0f) {
    /* Use uniform as a fallback if we can't use any strategy. */
-    return kernel_data.integrator.pdf_lights / M_4PI_F;
+    return 1.0f / M_4PI_F;
  }

  pdf_fac = 1.0f / pdf_fac;
@@ -430,7 +437,21 @@ ccl_device float background_light_pdf(KernelGlobals kg, float3 P, float3 directi
    pdf += background_map_pdf(kg, direction) * map_method_pdf;
  }

-  return pdf * kernel_data.integrator.pdf_lights;
+  return pdf;
+}
+
+ccl_device_forceinline bool background_light_tree_parameters(const float3 centroid,
+                                                             ccl_private float &cos_theta_u,
+                                                             ccl_private float2 &distance,
+                                                             ccl_private float3 &point_to_centroid)
+{
+  /* Cover the whole sphere */
+  cos_theta_u = -1.0f;
+
+  distance = make_float2(1.0f, 1.0f);
+  point_to_centroid = -centroid;
+
+  return true;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/common.h
+++ b/intern/cycles/kernel/light/common.h
@@ -7,92 +7,26 @@

 CCL_NAMESPACE_BEGIN

-/* Area light sampling */
+/* Light Sample Result */

-/* Uses the following paper:
- *
- * Carlos Urena et al.
- * An Area-Preserving Parametrization for Spherical Rectangles.
- *
- * https://www.solidangle.com/research/egsr2013_spherical_rectangle.pdf
- *
- * NOTE: light_p is modified when sample_coord is true.
- */
-ccl_device_inline float rect_light_sample(float3 P,
-                                          ccl_private float3 *light_p,
-                                          float3 axisu,
-                                          float3 axisv,
-                                          float randu,
-                                          float randv,
-                                          bool sample_coord)
-{
-  /* In our name system we're using P for the center,
-   * which is o in the paper.
-   */
+typedef struct LightSample {
+  float3 P;            /* position on light, or direction for distant light */
+  float3 Ng;           /* normal on light */
+  float3 D;            /* direction from shading point to light */
+  float t;             /* distance to light (FLT_MAX for distant light) */
+  float u, v;          /* parametric coordinate on primitive */
+  float pdf;           /* pdf for selecting light and point on light */
+  float pdf_selection; /* pdf for selecting light */
+  float eval_fac;      /* intensity multiplier */
+  int object;          /* object id for triangle/curve lights */
+  int prim;            /* primitive id for triangle/curve lights */
+  int shader;          /* shader id */
+  int lamp;            /* lamp id */
+  int group;           /* lightgroup */
+  LightType type;      /* type of light */
+} LightSample;

-  float3 corner = *light_p - axisu * 0.5f - axisv * 0.5f;
-  float axisu_len, axisv_len;
-  /* Compute local reference system R. */
-  float3 x = normalize_len(axisu, &axisu_len);
-  float3 y = normalize_len(axisv, &axisv_len);
-  float3 z = cross(x, y);
-  /* Compute rectangle coords in local reference system. */
-  float3 dir = corner - P;
-  float z0 = dot(dir, z);
-  /* Flip 'z' to make it point against Q. */
-  if (z0 > 0.0f) {
-    z *= -1.0f;
-    z0 *= -1.0f;
-  }
-  float x0 = dot(dir, x);
-  float y0 = dot(dir, y);
-  float x1 = x0 + axisu_len;
-  float y1 = y0 + axisv_len;
-  /* Compute internal angles (gamma_i). */
-  float4 diff = make_float4(x0, y1, x1, y0) - make_float4(x1, y0, x0, y1);
-  float4 nz = make_float4(y0, x1, y1, x0) * diff;
-  nz = nz / sqrt(z0 * z0 * diff * diff + nz * nz);
-  float g0 = safe_acosf(-nz.x * nz.y);
-  float g1 = safe_acosf(-nz.y * nz.z);
-  float g2 = safe_acosf(-nz.z * nz.w);
-  float g3 = safe_acosf(-nz.w * nz.x);
-  /* Compute predefined constants. */
-  float b0 = nz.x;
-  float b1 = nz.z;
-  float b0sq = b0 * b0;
-  float k = M_2PI_F - g2 - g3;
-  /* Compute solid angle from internal angles. */
-  float S = g0 + g1 - k;
-
-  if (sample_coord) {
-    /* Compute cu. */
-    float au = randu * S + k;
-    float fu = (cosf(au) * b0 - b1) / sinf(au);
-    float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
-    cu = clamp(cu, -1.0f, 1.0f);
-    /* Compute xu. */
-    float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
-    xu = clamp(xu, x0, x1);
-    /* Compute yv. */
-    float z0sq = z0 * z0;
-    float y0sq = y0 * y0;
-    float y1sq = y1 * y1;
-    float d = sqrtf(xu * xu + z0sq);
-    float h0 = y0 / sqrtf(d * d + y0sq);
-    float h1 = y1 / sqrtf(d * d + y1sq);
-    float hv = h0 + randv * (h1 - h0), hv2 = hv * hv;
-    float yv = (hv2 < 1.0f - 1e-6f) ? (hv * d) / sqrtf(1.0f - hv2) : y1;
-
-    /* Transform (xu, yv, z0) to world coords. */
-    *light_p = P + xu * x + yv * y + z0 * z;
-  }
-
-  /* return pdf */
-  if (S != 0.0f)
-    return 1.0f / S;
-  else
-    return 0.0f;
-}
+/* Utilities */

 ccl_device_inline float3 ellipse_sample(float3 ru, float3 rv, float randu, float randv)
 {
@@ -109,99 +43,7 @@ ccl_device float3 disk_light_sample(float3 v, float randu, float randv)
  return ellipse_sample(ru, rv, randu, randv);
 }

-ccl_device float3 distant_light_sample(float3 D, float radius, float randu, float randv)
-{
-  return normalize(D + disk_light_sample(D, randu, randv) * radius);
-}
-
-ccl_device float3
-sphere_light_sample(float3 P, float3 center, float radius, float randu, float randv)
-{
-  return disk_light_sample(normalize(P - center), randu, randv) * radius;
-}
-
-ccl_device float spot_light_attenuation(float3 dir, float spot_angle, float spot_smooth, float3 N)
-{
-  float attenuation = dot(dir, N);
-
-  if (attenuation <= spot_angle) {
-    attenuation = 0.0f;
-  }
-  else {
-    float t = attenuation - spot_angle;
-
-    if (t < spot_smooth && spot_smooth != 0.0f)
-      attenuation *= smoothstepf(t / spot_smooth);
-  }
-
-  return attenuation;
-}
-
-ccl_device float light_spread_attenuation(const float3 D,
-                                          const float3 lightNg,
-                                          const float tan_spread,
-                                          const float normalize_spread)
-{
-  /* Model a soft-box grid, computing the ratio of light not hidden by the
-   * slats of the grid at a given angle. (see D10594). */
-  const float cos_a = -dot(D, lightNg);
-  const float sin_a = safe_sqrtf(1.0f - sqr(cos_a));
-  const float tan_a = sin_a / cos_a;
-  return max((1.0f - (tan_spread * tan_a)) * normalize_spread, 0.0f);
-}
-
-/* Compute subset of area light that actually has an influence on the shading point, to
- * reduce noise with low spread. */
-ccl_device bool light_spread_clamp_area_light(const float3 P,
-                                              const float3 lightNg,
-                                              ccl_private float3 *lightP,
-                                              ccl_private float3 *axisu,
-                                              ccl_private float3 *axisv,
-                                              const float tan_spread)
-{
-  /* Closest point in area light plane and distance to that plane. */
-  const float3 closest_P = P - dot(lightNg, P - *lightP) * lightNg;
-  const float t = len(closest_P - P);
-
-  /* Radius of circle on area light that actually affects the shading point. */
-  const float radius = t / tan_spread;
-
-  /* TODO: would be faster to store as normalized vector + length, also in rect_light_sample. */
-  float len_u, len_v;
-  const float3 u = normalize_len(*axisu, &len_u);
-  const float3 v = normalize_len(*axisv, &len_v);
-
-  /* Local uv coordinates of closest point. */
-  const float closest_u = dot(u, closest_P - *lightP);
-  const float closest_v = dot(v, closest_P - *lightP);
-
-  /* Compute rectangle encompassing the circle that affects the shading point,
-   * clamped to the bounds of the area light. */
-  const float min_u = max(closest_u - radius, -len_u * 0.5f);
-  const float max_u = min(closest_u + radius, len_u * 0.5f);
-  const float min_v = max(closest_v - radius, -len_v * 0.5f);
-  const float max_v = min(closest_v + radius, len_v * 0.5f);
-
-  /* Skip if rectangle is empty. */
-  if (min_u >= max_u || min_v >= max_v) {
-    return false;
-  }
-
-  /* Compute new area light center position and axes from rectangle in local
-   * uv coordinates. */
-  const float new_center_u = 0.5f * (min_u + max_u);
-  const float new_center_v = 0.5f * (min_v + max_v);
-  const float new_len_u = max_u - min_u;
-  const float new_len_v = max_v - min_v;
-
-  *lightP = *lightP + new_center_u * u + new_center_v * v;
-  *axisu = u * new_len_u;
-  *axisv = v * new_len_v;
-
-  return true;
-}
-
-ccl_device float lamp_light_pdf(KernelGlobals kg, const float3 Ng, const float3 I, float t)
+ccl_device float lamp_light_pdf(const float3 Ng, const float3 I, float t)
 {
  float cos_pi = dot(Ng, I);

--- a/intern/cycles/kernel/light/distant.h
+++ b/intern/cycles/kernel/light/distant.h
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/geom/geom.h"
+
+#include "kernel/light/common.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device_inline bool distant_light_sample(const ccl_global KernelLight *klight,
+                                            const float randu,
+                                            const float randv,
+                                            ccl_private LightSample *ls)
+{
+  /* distant light */
+  float3 lightD = klight->co;
+  float3 D = lightD;
+  float radius = klight->distant.radius;
+  float invarea = klight->distant.invarea;
+
+  if (radius > 0.0f) {
+    D = normalize(D + disk_light_sample(D, randu, randv) * radius);
+  }
+
+  ls->P = D;
+  ls->Ng = D;
+  ls->D = -D;
+  ls->t = FLT_MAX;
+
+  float costheta = dot(lightD, D);
+  ls->pdf = invarea / (costheta * costheta * costheta);
+  ls->eval_fac = ls->pdf;
+
+  return true;
+}
+
+ccl_device bool distant_light_sample_from_intersection(KernelGlobals kg,
+                                                       const float3 ray_D,
+                                                       const int lamp,
+                                                       ccl_private LightSample *ccl_restrict ls)
+{
+  ccl_global const KernelLight *klight = &kernel_data_fetch(lights, lamp);
+  const int shader = klight->shader_id;
+  const float radius = klight->distant.radius;
+  const LightType type = (LightType)klight->type;
+
+  if (type != LIGHT_DISTANT) {
+    return false;
+  }
+  if (!(shader & SHADER_USE_MIS)) {
+    return false;
+  }
+  if (radius == 0.0f) {
+    return false;
+  }
+
+  /* a distant light is infinitely far away, but equivalent to a disk
+   * shaped light exactly 1 unit away from the current shading point.
+   *
+   *     radius              t^2/cos(theta)
+   *  <---------->           t = sqrt(1^2 + tan(theta)^2)
+   *       tan(th)           area = radius*radius*pi
+   *       <----->
+   *        \    |           (1 + tan(theta)^2)/cos(theta)
+   *         \   |           (1 + tan(acos(cos(theta)))^2)/cos(theta)
+   *       t  \th| 1         simplifies to
+   *           \-|           1/(cos(theta)^3)
+   *            \|           magic!
+   *             P
+   */
+
+  float3 lightD = klight->co;
+  float costheta = dot(-lightD, ray_D);
+  float cosangle = klight->distant.cosangle;
+
+  /* Workaround to prevent a hang in the classroom scene with AMD HIP drivers 22.10,
+   * Remove when a compiler fix is available. */
+#ifdef __HIP__
+  ls->shader = klight->shader_id;
+#endif
+
+  if (costheta < cosangle)
+    return false;
+
+  ls->type = type;
+#ifndef __HIP__
+  ls->shader = klight->shader_id;
+#endif
+  ls->object = PRIM_NONE;
+  ls->prim = PRIM_NONE;
+  ls->lamp = lamp;
+  /* todo: missing texture coordinates */
+  ls->u = 0.0f;
+  ls->v = 0.0f;
+  ls->t = FLT_MAX;
+  ls->P = -ray_D;
+  ls->Ng = -ray_D;
+  ls->D = ray_D;
+  ls->group = lamp_lightgroup(kg, lamp);
+
+  /* compute pdf */
+  float invarea = klight->distant.invarea;
+  ls->pdf = invarea / (costheta * costheta * costheta);
+  ls->eval_fac = ls->pdf;
+
+  return true;
+}
+
+ccl_device_forceinline bool distant_light_tree_parameters(const float3 centroid,
+                                                          const float theta_e,
+                                                          ccl_private float &cos_theta_u,
+                                                          ccl_private float2 &distance,
+                                                          ccl_private float3 &point_to_centroid)
+{
+  /* Treating it as a disk light 1 unit away */
+  cos_theta_u = fast_cosf(theta_e);
+
+  distance = make_float2(1.0f / cos_theta_u, 1.0f);
+
+  point_to_centroid = -centroid;
+
+  return true;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/distribution.h
+++ b/intern/cycles/kernel/light/distribution.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/light/light.h"
+#include "kernel/light/triangle.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Simple CDF based sampling over all lights in the scene, without taking into
+ * account shading position or normal. */
+
+ccl_device int light_distribution_sample(KernelGlobals kg, ccl_private float &randu)
+{
+  /* This is basically std::upper_bound as used by PBRT, to find a point light or
+   * triangle to emit from, proportional to area. a good improvement would be to
+   * also sample proportional to power, though it's not so well defined with
+   * arbitrary shaders. */
+  int first = 0;
+  int len = kernel_data.integrator.num_distribution + 1;
+  float r = randu;
+
+  do {
+    int half_len = len >> 1;
+    int middle = first + half_len;
+
+    if (r < kernel_data_fetch(light_distribution, middle).totarea) {
+      len = half_len;
+    }
+    else {
+      first = middle + 1;
+      len = len - half_len - 1;
+    }
+  } while (len > 0);
+
+  /* Clamping should not be needed but float rounding errors seem to
+   * make this fail on rare occasions. */
+  int index = clamp(first - 1, 0, kernel_data.integrator.num_distribution - 1);
+
+  /* Rescale to reuse random number. this helps the 2D samples within
+   * each area light be stratified as well. */
+  float distr_min = kernel_data_fetch(light_distribution, index).totarea;
+  float distr_max = kernel_data_fetch(light_distribution, index + 1).totarea;
+  randu = (r - distr_min) / (distr_max - distr_min);
+
+  return index;
+}
+
+ccl_device_noinline bool light_distribution_sample(KernelGlobals kg,
+                                                   ccl_private float &randu,
+                                                   const float randv,
+                                                   const float time,
+                                                   const float3 P,
+                                                   const int bounce,
+                                                   const uint32_t path_flag,
+                                                   ccl_private int &emitter_object,
+                                                   ccl_private int &emitter_prim,
+                                                   ccl_private int &emitter_shader_flag,
+                                                   ccl_private float &emitter_pdf_selection)
+{
+  /* Sample light index from distribution. */
+  const int index = light_distribution_sample(kg, randu);
+  ccl_global const KernelLightDistribution *kdistribution = &kernel_data_fetch(light_distribution,
+                                                                               index);
+
+  emitter_object = kdistribution->mesh_light.object_id;
+  emitter_prim = kdistribution->prim;
+  emitter_shader_flag = kdistribution->mesh_light.shader_flag;
+  emitter_pdf_selection = kernel_data.integrator.distribution_pdf_lights;
+
+  return true;
+}
+
+ccl_device_inline float light_distribution_pdf_lamp(KernelGlobals kg)
+{
+  return kernel_data.integrator.distribution_pdf_lights;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -3,31 +3,18 @@

 #pragma once

-#include "kernel/geom/geom.h"
+#include "kernel/light/area.h"
 #include "kernel/light/background.h"
+#include "kernel/light/distant.h"
+#include "kernel/light/point.h"
+#include "kernel/light/spot.h"
+#include "kernel/light/triangle.h"
+
 #include "kernel/sample/mapping.h"

 CCL_NAMESPACE_BEGIN

-/* Light Sample result */
-
-typedef struct LightSample {
-  float3 P;       /* position on light, or direction for distant light */
-  float3 Ng;      /* normal on light */
-  float3 D;       /* direction from shading point to light */
-  float t;        /* distance to light (FLT_MAX for distant light) */
-  float u, v;     /* parametric coordinate on primitive */
-  float pdf;      /* light sampling probability density function */
-  float eval_fac; /* intensity multiplier */
-  int object;     /* object id for triangle/curve lights */
-  int prim;       /* primitive id for triangle/curve lights */
-  int shader;     /* shader id */
-  int lamp;       /* lamp id */
-  int group;      /* lightgroup */
-  LightType type; /* type of light */
-} LightSample;
-
-/* Regular Light */
+/* Sample point on an individual light. */

 template<bool in_volume_segment>
 ccl_device_inline bool light_sample(KernelGlobals kg,
@@ -63,28 +50,15 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
    ls->Ng = zero_float3();
    ls->D = zero_float3();
    ls->pdf = 1.0f;
+    ls->eval_fac = 0.0f;
    ls->t = FLT_MAX;
    return true;
  }

  if (type == LIGHT_DISTANT) {
-    /* distant light */
-    float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-    float3 D = lightD;
-    float radius = klight->distant.radius;
-    float invarea = klight->distant.invarea;
-
-    if (radius > 0.0f)
-      D = distant_light_sample(D, radius, randu, randv);
-
-    ls->P = D;
-    ls->Ng = D;
-    ls->D = -D;
-    ls->t = FLT_MAX;
-
-    float costheta = dot(lightD, D);
-    ls->pdf = invarea / (costheta * costheta * costheta);
-    ls->eval_fac = ls->pdf;
+    if (!distant_light_sample(klight, randu, randv, ls)) {
+      return false;
+    }
  }
  else if (type == LIGHT_BACKGROUND) {
    /* infinite area light (e.g. light dome or env light) */
@@ -96,139 +70,28 @@ ccl_device_inline bool light_sample(KernelGlobals kg,
    ls->t = FLT_MAX;
    ls->eval_fac = 1.0f;
  }
+  else if (type == LIGHT_SPOT) {
+    if (!spot_light_sample<in_volume_segment>(klight, randu, randv, P, ls)) {
+      return false;
+    }
+  }
+  else if (type == LIGHT_POINT) {
+    if (!point_light_sample<in_volume_segment>(klight, randu, randv, P, ls)) {
+      return false;
+    }
+  }
  else {
-    ls->P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-
-    if (type == LIGHT_SPOT) {
-      const float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-      const float radius = klight->spot.radius;
-      const float3 dir = make_float3(
-          klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
-      /* disk oriented normal */
-      const float3 lightN = normalize(P - center);
-      ls->P = center;
-
-      if (radius > 0.0f)
-        /* disk light */
-        ls->P += disk_light_sample(lightN, randu, randv) * radius;
-
-      const float invarea = klight->spot.invarea;
-      ls->pdf = invarea;
-
-      ls->D = normalize_len(ls->P - P, &ls->t);
-      /* we set the light normal to the outgoing direction to support texturing */
-      ls->Ng = -ls->D;
-
-      ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
-
-      /* spot light attenuation */
-      ls->eval_fac *= spot_light_attenuation(
-          dir, klight->spot.spot_angle, klight->spot.spot_smooth, -ls->D);
-      if (!in_volume_segment && ls->eval_fac == 0.0f) {
-        return false;
-      }
-
-      float2 uv = map_to_sphere(ls->Ng);
-      ls->u = uv.x;
-      ls->v = uv.y;
-
-      ls->pdf *= lamp_light_pdf(kg, lightN, -ls->D, ls->t);
-    }
-    else if (type == LIGHT_POINT) {
-      float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-      float radius = klight->spot.radius;
-      /* disk oriented normal */
-      const float3 lightN = normalize(P - center);
-      ls->P = center;
-
-      if (radius > 0.0f) {
-        ls->P += disk_light_sample(lightN, randu, randv) * radius;
-      }
-      ls->pdf = klight->spot.invarea;
-
-      ls->D = normalize_len(ls->P - P, &ls->t);
-      /* we set the light normal to the outgoing direction to support texturing */
-      ls->Ng = -ls->D;
-
-      ls->eval_fac = M_1_PI_F * 0.25f * klight->spot.invarea;
-      if (!in_volume_segment && ls->eval_fac == 0.0f) {
-        return false;
-      }
-
-      float2 uv = map_to_sphere(ls->Ng);
-      ls->u = uv.x;
-      ls->v = uv.y;
-      ls->pdf *= lamp_light_pdf(kg, lightN, -ls->D, ls->t);
-    }
-    else {
-      /* area light */
-      float3 axisu = make_float3(
-          klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-      float3 axisv = make_float3(
-          klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
-      float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-      float invarea = fabsf(klight->area.invarea);
-      bool is_round = (klight->area.invarea < 0.0f);
-
-      if (!in_volume_segment) {
-        if (dot(ls->P - P, Ng) > 0.0f) {
-          return false;
-        }
-      }
-
-      float3 inplane;
-
-      if (is_round || in_volume_segment) {
-        inplane = ellipse_sample(axisu * 0.5f, axisv * 0.5f, randu, randv);
-        ls->P += inplane;
-        ls->pdf = invarea;
-      }
-      else {
-        inplane = ls->P;
-
-        float3 sample_axisu = axisu;
-        float3 sample_axisv = axisv;
-
-        if (!in_volume_segment && klight->area.tan_spread > 0.0f) {
-          if (!light_spread_clamp_area_light(
-                  P, Ng, &ls->P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
-            return false;
-          }
-        }
-
-        ls->pdf = rect_light_sample(P, &ls->P, sample_axisu, sample_axisv, randu, randv, true);
-        inplane = ls->P - inplane;
-      }
-
-      const float light_u = dot(inplane, axisu) * (1.0f / dot(axisu, axisu));
-      const float light_v = dot(inplane, axisv) * (1.0f / dot(axisv, axisv));
-
-      /* NOTE: Return barycentric coordinates in the same notation as Embree and OptiX. */
-      ls->u = light_v + 0.5f;
-      ls->v = -light_u - light_v;
-
-      ls->Ng = Ng;
-      ls->D = normalize_len(ls->P - P, &ls->t);
-
-      ls->eval_fac = 0.25f * invarea;
-
-      if (klight->area.tan_spread > 0.0f) {
-        /* Area Light spread angle attenuation */
-        ls->eval_fac *= light_spread_attenuation(
-            ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
-      }
-
-      if (is_round) {
-        ls->pdf *= lamp_light_pdf(kg, Ng, -ls->D, ls->t);
-      }
+    /* area light */
+    if (!area_light_sample<in_volume_segment>(klight, randu, randv, P, ls)) {
+      return false;
    }
  }

-  ls->pdf *= kernel_data.integrator.pdf_lights;
-
  return in_volume_segment || (ls->pdf > 0.0f);
 }

+/* Intersect ray with individual light. */
+
 ccl_device bool lights_intersect(KernelGlobals kg,
                                 IntegratorState state,
                                 ccl_private const Ray *ccl_restrict ray,
@@ -238,7 +101,7 @@ ccl_device bool lights_intersect(KernelGlobals kg,
                                 const int last_type,
                                 const uint32_t path_flag)
 {
-  for (int lamp = 0; lamp < kernel_data.integrator.num_all_lights; lamp++) {
+  for (int lamp = 0; lamp < kernel_data.integrator.num_lights; lamp++) {
    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, lamp);

    if (path_flag & PATH_RAY_CAMERA) {
@@ -271,76 +134,17 @@ ccl_device bool lights_intersect(KernelGlobals kg,
    float t = 0.0f, u = 0.0f, v = 0.0f;

    if (type == LIGHT_SPOT) {
-      /* Spot/Disk light. */
-      const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-      const float radius = klight->spot.radius;
-      if (radius == 0.0f) {
-        continue;
-      }
-      /* disk oriented normal */
-      const float3 lightN = normalize(ray->P - lightP);
-      /* One sided. */
-      if (dot(ray->D, lightN) >= 0.0f) {
-        continue;
-      }
-
-      float3 P;
-      if (!ray_disk_intersect(
-              ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) {
+      if (!spot_light_intersect(klight, ray, &t)) {
        continue;
      }
    }
    else if (type == LIGHT_POINT) {
-      /* Sphere light (aka, aligned disk light). */
-      const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-      const float radius = klight->spot.radius;
-      if (radius == 0.0f) {
-        continue;
-      }
-
-      /* disk oriented normal */
-      const float3 lightN = normalize(ray->P - lightP);
-      float3 P;
-      if (!ray_disk_intersect(
-              ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) {
+      if (!point_light_intersect(klight, ray, &t)) {
        continue;
      }
    }
    else if (type == LIGHT_AREA) {
-      /* Area light. */
-      const float invarea = fabsf(klight->area.invarea);
-      const bool is_round = (klight->area.invarea < 0.0f);
-      if (invarea == 0.0f) {
-        continue;
-      }
-
-      const float3 axisu = make_float3(
-          klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-      const float3 axisv = make_float3(
-          klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
-      const float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-
-      /* One sided. */
-      if (dot(ray->D, Ng) >= 0.0f) {
-        continue;
-      }
-
-      const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-
-      float3 P;
-      if (!ray_quad_intersect(ray->P,
-                              ray->D,
-                              ray->tmin,
-                              ray->tmax,
-                              light_P,
-                              axisu,
-                              axisv,
-                              Ng,
-                              &P,
-                              &t,
-                              &u,
-                              &v,
-                              is_round)) {
+      if (!area_light_intersect(klight, ray, &t, &u, &v)) {
        continue;
      }
    }
@@ -362,78 +166,7 @@ ccl_device bool lights_intersect(KernelGlobals kg,
  return isect->prim != PRIM_NONE;
 }

-ccl_device bool light_sample_from_distant_ray(KernelGlobals kg,
-                                              const float3 ray_D,
-                                              const int lamp,
-                                              ccl_private LightSample *ccl_restrict ls)
-{
-  ccl_global const KernelLight *klight = &kernel_data_fetch(lights, lamp);
-  const int shader = klight->shader_id;
-  const float radius = klight->distant.radius;
-  const LightType type = (LightType)klight->type;
-
-  if (type != LIGHT_DISTANT) {
-    return false;
-  }
-  if (!(shader & SHADER_USE_MIS)) {
-    return false;
-  }
-  if (radius == 0.0f) {
-    return false;
-  }
-
-  /* a distant light is infinitely far away, but equivalent to a disk
-   * shaped light exactly 1 unit away from the current shading point.
-   *
-   *     radius              t^2/cos(theta)
-   *  <---------->           t = sqrt(1^2 + tan(theta)^2)
-   *       tan(th)           area = radius*radius*pi
-   *       <----->
-   *        \    |           (1 + tan(theta)^2)/cos(theta)
-   *         \   |           (1 + tan(acos(cos(theta)))^2)/cos(theta)
-   *       t  \th| 1         simplifies to
-   *           \-|           1/(cos(theta)^3)
-   *            \|           magic!
-   *             P
-   */
-
-  float3 lightD = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-  float costheta = dot(-lightD, ray_D);
-  float cosangle = klight->distant.cosangle;
-
-  /* Workaround to prevent a hang in the classroom scene with AMD HIP drivers 22.10,
-   * Remove when a compiler fix is available. */
-#ifdef __HIP__
-  ls->shader = klight->shader_id;
-#endif
-
-  if (costheta < cosangle)
-    return false;
-
-  ls->type = type;
-#ifndef __HIP__
-  ls->shader = klight->shader_id;
-#endif
-  ls->object = PRIM_NONE;
-  ls->prim = PRIM_NONE;
-  ls->lamp = lamp;
-  /* todo: missing texture coordinates */
-  ls->u = 0.0f;
-  ls->v = 0.0f;
-  ls->t = FLT_MAX;
-  ls->P = -ray_D;
-  ls->Ng = -ray_D;
-  ls->D = ray_D;
-  ls->group = lamp_lightgroup(kg, lamp);
-
-  /* compute pdf */
-  float invarea = klight->distant.invarea;
-  ls->pdf = invarea / (costheta * costheta * costheta);
-  ls->eval_fac = ls->pdf;
-  ls->pdf *= kernel_data.integrator.pdf_lights;
-
-  return true;
-}
+/* Setup light sample from intersection. */

 ccl_device bool light_sample_from_intersection(KernelGlobals kg,
                                               ccl_private const Intersection *ccl_restrict isect,
@@ -456,102 +189,18 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
  ls->group = lamp_lightgroup(kg, lamp);

  if (type == LIGHT_SPOT) {
-    const float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-    const float3 dir = make_float3(klight->spot.dir[0], klight->spot.dir[1], klight->spot.dir[2]);
-    /* the normal of the oriented disk */
-    const float3 lightN = normalize(ray_P - center);
-    /* We set the light normal to the outgoing direction to support texturing. */
-    ls->Ng = -ls->D;
-
-    float invarea = klight->spot.invarea;
-    ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
-    ls->pdf = invarea;
-
-    /* spot light attenuation */
-    ls->eval_fac *= spot_light_attenuation(
-        dir, klight->spot.spot_angle, klight->spot.spot_smooth, -ls->D);
-
-    if (ls->eval_fac == 0.0f) {
+    if (!spot_light_sample_from_intersection(klight, isect, ray_P, ray_D, ls)) {
      return false;
    }
-
-    float2 uv = map_to_sphere(ls->Ng);
-    ls->u = uv.x;
-    ls->v = uv.y;
-
-    /* compute pdf */
-    if (ls->t != FLT_MAX)
-      ls->pdf *= lamp_light_pdf(kg, lightN, -ls->D, ls->t);
-    else
-      ls->pdf = 0.f;
  }
  else if (type == LIGHT_POINT) {
-    const float3 center = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-    const float3 lighN = normalize(ray_P - center);
-
-    /* We set the light normal to the outgoing direction to support texturing. */
-    ls->Ng = -ls->D;
-
-    float invarea = klight->spot.invarea;
-    ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
-    ls->pdf = invarea;
-
-    if (ls->eval_fac == 0.0f) {
+    if (!point_light_sample_from_intersection(klight, isect, ray_P, ray_D, ls)) {
      return false;
    }
-
-    float2 uv = map_to_sphere(ls->Ng);
-    ls->u = uv.x;
-    ls->v = uv.y;
-
-    /* compute pdf */
-    if (ls->t != FLT_MAX)
-      ls->pdf *= lamp_light_pdf(kg, lighN, -ls->D, ls->t);
-    else
-      ls->pdf = 0.f;
  }
  else if (type == LIGHT_AREA) {
-    /* area light */
-    float invarea = fabsf(klight->area.invarea);
-
-    float3 axisu = make_float3(
-        klight->area.axisu[0], klight->area.axisu[1], klight->area.axisu[2]);
-    float3 axisv = make_float3(
-        klight->area.axisv[0], klight->area.axisv[1], klight->area.axisv[2]);
-    float3 Ng = make_float3(klight->area.dir[0], klight->area.dir[1], klight->area.dir[2]);
-    float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);
-
-    ls->u = isect->u;
-    ls->v = isect->v;
-    ls->D = ray_D;
-    ls->Ng = Ng;
-
-    const bool is_round = (klight->area.invarea < 0.0f);
-    if (is_round) {
-      ls->pdf = invarea * lamp_light_pdf(kg, Ng, -ray_D, ls->t);
-    }
-    else {
-      float3 sample_axisu = axisu;
-      float3 sample_axisv = axisv;
-
-      if (klight->area.tan_spread > 0.0f) {
-        if (!light_spread_clamp_area_light(
-                ray_P, Ng, &light_P, &sample_axisu, &sample_axisv, klight->area.tan_spread)) {
-          return false;
-        }
-      }
-
-      ls->pdf = rect_light_sample(ray_P, &light_P, sample_axisu, sample_axisv, 0, 0, false);
-    }
-    ls->eval_fac = 0.25f * invarea;
-
-    if (klight->area.tan_spread > 0.0f) {
-      /* Area Light spread angle attenuation */
-      ls->eval_fac *= light_spread_attenuation(
-          ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
-      if (ls->eval_fac == 0.0f) {
-        return false;
-      }
+    if (!area_light_sample_from_intersection(klight, isect, ray_P, ray_D, ls)) {
+      return false;
    }
  }
  else {
@@ -559,411 +208,33 @@ ccl_device bool light_sample_from_intersection(KernelGlobals kg,
    return false;
  }

-  ls->pdf *= kernel_data.integrator.pdf_lights;
-
  return true;
 }

-/* Triangle Light */
+/* Update light sample for changed new position, for MNEE. */

-/* returns true if the triangle is has motion blur or an instancing transform applied */
-ccl_device_inline bool triangle_world_space_vertices(
-    KernelGlobals kg, int object, int prim, float time, float3 V[3])
-{
-  bool has_motion = false;
-  const int object_flag = kernel_data_fetch(object_flag, object);
-
-  if (object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
-    motion_triangle_vertices(kg, object, prim, time, V);
-    has_motion = true;
-  }
-  else {
-    triangle_vertices(kg, prim, V);
-  }
-
-  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
-#ifdef __OBJECT_MOTION__
-    float object_time = (time >= 0.0f) ? time : 0.5f;
-    Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL);
-#else
-    Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
-#endif
-    V[0] = transform_point(&tfm, V[0]);
-    V[1] = transform_point(&tfm, V[1]);
-    V[2] = transform_point(&tfm, V[2]);
-    has_motion = true;
-  }
-  return has_motion;
-}
-
-ccl_device_inline float triangle_light_pdf_area(KernelGlobals kg,
-                                                const float3 Ng,
-                                                const float3 I,
-                                                float t)
-{
-  float pdf = kernel_data.integrator.pdf_triangles;
-  float cos_pi = fabsf(dot(Ng, I));
-
-  if (cos_pi == 0.0f)
-    return 0.0f;
-
-  return t * t * pdf / cos_pi;
-}
-
-ccl_device_forceinline float triangle_light_pdf(KernelGlobals kg,
-                                                ccl_private const ShaderData *sd,
-                                                float t)
-{
-  /* A naive heuristic to decide between costly solid angle sampling
-   * and simple area sampling, comparing the distance to the triangle plane
-   * to the length of the edges of the triangle. */
-
-  float3 V[3];
-  bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V);
-
-  const float3 e0 = V[1] - V[0];
-  const float3 e1 = V[2] - V[0];
-  const float3 e2 = V[2] - V[1];
-  const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
-  const float3 N = cross(e0, e1);
-  const float distance_to_plane = fabsf(dot(N, sd->I * t)) / dot(N, N);
-
-  if (longest_edge_squared > distance_to_plane * distance_to_plane) {
-    /* sd contains the point on the light source
-     * calculate Px, the point that we're shading */
-    const float3 Px = sd->P + sd->I * t;
-    const float3 v0_p = V[0] - Px;
-    const float3 v1_p = V[1] - Px;
-    const float3 v2_p = V[2] - Px;
-
-    const float3 u01 = safe_normalize(cross(v0_p, v1_p));
-    const float3 u02 = safe_normalize(cross(v0_p, v2_p));
-    const float3 u12 = safe_normalize(cross(v1_p, v2_p));
-
-    const float alpha = fast_acosf(dot(u02, u01));
-    const float beta = fast_acosf(-dot(u01, u12));
-    const float gamma = fast_acosf(dot(u02, u12));
-    const float solid_angle = alpha + beta + gamma - M_PI_F;
-
-    /* pdf_triangles is calculated over triangle area, but we're not sampling over its area */
-    if (UNLIKELY(solid_angle == 0.0f)) {
-      return 0.0f;
-    }
-    else {
-      float area = 1.0f;
-      if (has_motion) {
-        /* get the center frame vertices, this is what the PDF was calculated from */
-        triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
-        area = triangle_area(V[0], V[1], V[2]);
-      }
-      else {
-        area = 0.5f * len(N);
-      }
-      const float pdf = area * kernel_data.integrator.pdf_triangles;
-      return pdf / solid_angle;
-    }
-  }
-  else {
-    float pdf = triangle_light_pdf_area(kg, sd->Ng, sd->I, t);
-    if (has_motion) {
-      const float area = 0.5f * len(N);
-      if (UNLIKELY(area == 0.0f)) {
-        return 0.0f;
-      }
-      /* scale the PDF.
-       * area = the area the sample was taken from
-       * area_pre = the are from which pdf_triangles was calculated from */
-      triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
-      const float area_pre = triangle_area(V[0], V[1], V[2]);
-      pdf = pdf * area_pre / area;
-    }
-    return pdf;
-  }
-}
-
-template<bool in_volume_segment>
-ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
-                                                  int prim,
-                                                  int object,
-                                                  float randu,
-                                                  float randv,
-                                                  float time,
+ccl_device_forceinline void light_update_position(KernelGlobals kg,
                                                  ccl_private LightSample *ls,
                                                  const float3 P)
 {
-  /* A naive heuristic to decide between costly solid angle sampling
-   * and simple area sampling, comparing the distance to the triangle plane
-   * to the length of the edges of the triangle. */
+  const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ls->lamp);

-  float3 V[3];
-  bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V);
-
-  const float3 e0 = V[1] - V[0];
-  const float3 e1 = V[2] - V[0];
-  const float3 e2 = V[2] - V[1];
-  const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
-  const float3 N0 = cross(e0, e1);
-  float Nl = 0.0f;
-  ls->Ng = safe_normalize_len(N0, &Nl);
-  float area = 0.5f * Nl;
-
-  /* flip normal if necessary */
-  const int object_flag = kernel_data_fetch(object_flag, object);
-  if (object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
-    ls->Ng = -ls->Ng;
+  if (ls->type == LIGHT_POINT) {
+    point_light_update_position(klight, ls, P);
  }
-  ls->eval_fac = 1.0f;
-  ls->shader = kernel_data_fetch(tri_shader, prim);
-  ls->object = object;
-  ls->prim = prim;
-  ls->lamp = LAMP_NONE;
-  ls->shader |= SHADER_USE_MIS;
-  ls->type = LIGHT_TRIANGLE;
-  ls->group = object_lightgroup(kg, object);
-
-  float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0));
-
-  if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) {
-    /* see James Arvo, "Stratified Sampling of Spherical Triangles"
-     * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
-
-    /* project the triangle to the unit sphere
-     * and calculate its edges and angles */
-    const float3 v0_p = V[0] - P;
-    const float3 v1_p = V[1] - P;
-    const float3 v2_p = V[2] - P;
-
-    const float3 u01 = safe_normalize(cross(v0_p, v1_p));
-    const float3 u02 = safe_normalize(cross(v0_p, v2_p));
-    const float3 u12 = safe_normalize(cross(v1_p, v2_p));
-
-    const float3 A = safe_normalize(v0_p);
-    const float3 B = safe_normalize(v1_p);
-    const float3 C = safe_normalize(v2_p);
-
-    const float cos_alpha = dot(u02, u01);
-    const float cos_beta = -dot(u01, u12);
-    const float cos_gamma = dot(u02, u12);
-
-    /* calculate dihedral angles */
-    const float alpha = fast_acosf(cos_alpha);
-    const float beta = fast_acosf(cos_beta);
-    const float gamma = fast_acosf(cos_gamma);
-    /* the area of the unit spherical triangle = solid angle */
-    const float solid_angle = alpha + beta + gamma - M_PI_F;
-
-    /* precompute a few things
-     * these could be re-used to take several samples
-     * as they are independent of randu/randv */
-    const float cos_c = dot(A, B);
-    const float sin_alpha = fast_sinf(alpha);
-    const float product = sin_alpha * cos_c;
-
-    /* Select a random sub-area of the spherical triangle
-     * and calculate the third vertex C_ of that new triangle */
-    const float phi = randu * solid_angle - alpha;
-    float s, t;
-    fast_sincosf(phi, &s, &t);
-    const float u = t - cos_alpha;
-    const float v = s + product;
-
-    const float3 U = safe_normalize(C - dot(C, A) * A);
-
-    float q = 1.0f;
-    const float det = ((v * s + u * t) * sin_alpha);
-    if (det != 0.0f) {
-      q = ((v * t - u * s) * cos_alpha - v) / det;
-    }
-    const float temp = max(1.0f - q * q, 0.0f);
-
-    const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U);
-
-    /* Finally, select a random point along the edge of the new triangle
-     * That point on the spherical triangle is the sampled ray direction */
-    const float z = 1.0f - randv * (1.0f - dot(C_, B));
-    ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);
-
-    /* calculate intersection with the planar triangle */
-    if (!ray_triangle_intersect(
-            P, ls->D, 0.0f, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
-      ls->pdf = 0.0f;
-      return;
-    }
-
-    ls->P = P + ls->D * ls->t;
-
-    /* pdf_triangles is calculated over triangle area, but we're sampling over solid angle */
-    if (UNLIKELY(solid_angle == 0.0f)) {
-      ls->pdf = 0.0f;
-      return;
-    }
-    else {
-      if (has_motion) {
-        /* get the center frame vertices, this is what the PDF was calculated from */
-        triangle_world_space_vertices(kg, object, prim, -1.0f, V);
-        area = triangle_area(V[0], V[1], V[2]);
-      }
-      const float pdf = area * kernel_data.integrator.pdf_triangles;
-      ls->pdf = pdf / solid_angle;
-    }
+  else if (ls->type == LIGHT_SPOT) {
+    spot_light_update_position(klight, ls, P);
  }
-  else {
-    /* compute random point in triangle. From Eric Heitz's "A Low-Distortion Map Between Triangle
-     * and Square" */
-    float u = randu;
-    float v = randv;
-    if (v > u) {
-      u *= 0.5f;
-      v -= u;
-    }
-    else {
-      v *= 0.5f;
-      u -= v;
-    }
-
-    const float t = 1.0f - u - v;
-    ls->P = u * V[0] + v * V[1] + t * V[2];
-    /* compute incoming direction, distance and pdf */
-    ls->D = normalize_len(ls->P - P, &ls->t);
-    ls->pdf = triangle_light_pdf_area(kg, ls->Ng, -ls->D, ls->t);
-    if (has_motion && area != 0.0f) {
-      /* scale the PDF.
-       * area = the area the sample was taken from
-       * area_pre = the are from which pdf_triangles was calculated from */
-      triangle_world_space_vertices(kg, object, prim, -1.0f, V);
-      const float area_pre = triangle_area(V[0], V[1], V[2]);
-      ls->pdf = ls->pdf * area_pre / area;
-    }
-    ls->u = u;
-    ls->v = v;
+  else if (ls->type == LIGHT_AREA) {
+    area_light_update_position(klight, ls, P);
  }
 }

-/* Light Distribution */
-
-ccl_device int light_distribution_sample(KernelGlobals kg, ccl_private float *randu)
-{
-  /* This is basically std::upper_bound as used by PBRT, to find a point light or
-   * triangle to emit from, proportional to area. a good improvement would be to
-   * also sample proportional to power, though it's not so well defined with
-   * arbitrary shaders. */
-  int first = 0;
-  int len = kernel_data.integrator.num_distribution + 1;
-  float r = *randu;
-
-  do {
-    int half_len = len >> 1;
-    int middle = first + half_len;
-
-    if (r < kernel_data_fetch(light_distribution, middle).totarea) {
-      len = half_len;
-    }
-    else {
-      first = middle + 1;
-      len = len - half_len - 1;
-    }
-  } while (len > 0);
-
-  /* Clamping should not be needed but float rounding errors seem to
-   * make this fail on rare occasions. */
-  int index = clamp(first - 1, 0, kernel_data.integrator.num_distribution - 1);
-
-  /* Rescale to reuse random number. this helps the 2D samples within
-   * each area light be stratified as well. */
-  float distr_min = kernel_data_fetch(light_distribution, index).totarea;
-  float distr_max = kernel_data_fetch(light_distribution, index + 1).totarea;
-  *randu = (r - distr_min) / (distr_max - distr_min);
-
-  return index;
-}
-
-/* Generic Light */
+/* Light info. */

 ccl_device_inline bool light_select_reached_max_bounces(KernelGlobals kg, int index, int bounce)
 {
  return (bounce > kernel_data_fetch(lights, index).max_bounces);
 }

-template<bool in_volume_segment>
-ccl_device_noinline bool light_distribution_sample(KernelGlobals kg,
-                                                   float randu,
-                                                   const float randv,
-                                                   const float time,
-                                                   const float3 P,
-                                                   const int bounce,
-                                                   const uint32_t path_flag,
-                                                   ccl_private LightSample *ls)
-{
-  /* Sample light index from distribution. */
-  const int index = light_distribution_sample(kg, &randu);
-  ccl_global const KernelLightDistribution *kdistribution = &kernel_data_fetch(light_distribution,
-                                                                               index);
-  const int prim = kdistribution->prim;
-
-  if (prim >= 0) {
-    /* Mesh light. */
-    const int object = kdistribution->mesh_light.object_id;
-
-    /* Exclude synthetic meshes from shadow catcher pass. */
-    if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
-        !(kernel_data_fetch(object_flag, object) & SD_OBJECT_SHADOW_CATCHER)) {
-      return false;
-    }
-
-    const int shader_flag = kdistribution->mesh_light.shader_flag;
-    triangle_light_sample<in_volume_segment>(kg, prim, object, randu, randv, time, ls, P);
-    ls->shader |= shader_flag;
-    return (ls->pdf > 0.0f);
-  }
-
-  const int lamp = -prim - 1;
-
-  if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
-    return false;
-  }
-
-  return light_sample<in_volume_segment>(kg, lamp, randu, randv, P, path_flag, ls);
-}
-
-ccl_device_inline bool light_distribution_sample_from_volume_segment(KernelGlobals kg,
-                                                                     float randu,
-                                                                     const float randv,
-                                                                     const float time,
-                                                                     const float3 P,
-                                                                     const int bounce,
-                                                                     const uint32_t path_flag,
-                                                                     ccl_private LightSample *ls)
-{
-  return light_distribution_sample<true>(kg, randu, randv, time, P, bounce, path_flag, ls);
-}
-
-ccl_device_inline bool light_distribution_sample_from_position(KernelGlobals kg,
-                                                               float randu,
-                                                               const float randv,
-                                                               const float time,
-                                                               const float3 P,
-                                                               const int bounce,
-                                                               const uint32_t path_flag,
-                                                               ccl_private LightSample *ls)
-{
-  return light_distribution_sample<false>(kg, randu, randv, time, P, bounce, path_flag, ls);
-}
-
-ccl_device_inline bool light_distribution_sample_new_position(KernelGlobals kg,
-                                                              const float randu,
-                                                              const float randv,
-                                                              const float time,
-                                                              const float3 P,
-                                                              ccl_private LightSample *ls)
-{
-  /* Sample a new position on the same light, for volume sampling. */
-  if (ls->type == LIGHT_TRIANGLE) {
-    triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P);
-    return (ls->pdf > 0.0f);
-  }
-  else {
-    return light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls);
-  }
-}
-
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/point.h
+++ b/intern/cycles/kernel/light/point.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/light/common.h"
+
+CCL_NAMESPACE_BEGIN
+
+template<bool in_volume_segment>
+ccl_device_inline bool point_light_sample(const ccl_global KernelLight *klight,
+                                          const float randu,
+                                          const float randv,
+                                          const float3 P,
+                                          ccl_private LightSample *ls)
+{
+  float3 center = klight->co;
+  float radius = klight->spot.radius;
+  /* disk oriented normal */
+  const float3 lightN = normalize(P - center);
+  ls->P = center;
+
+  if (radius > 0.0f) {
+    ls->P += disk_light_sample(lightN, randu, randv) * radius;
+  }
+  ls->pdf = klight->spot.invarea;
+
+  ls->D = normalize_len(ls->P - P, &ls->t);
+  /* we set the light normal to the outgoing direction to support texturing */
+  ls->Ng = -ls->D;
+
+  ls->eval_fac = M_1_PI_F * 0.25f * klight->spot.invarea;
+  if (!in_volume_segment && ls->eval_fac == 0.0f) {
+    return false;
+  }
+
+  float2 uv = map_to_sphere(ls->Ng);
+  ls->u = uv.x;
+  ls->v = uv.y;
+  ls->pdf *= lamp_light_pdf(lightN, -ls->D, ls->t);
+  return true;
+}
+
+ccl_device_forceinline void point_light_update_position(const ccl_global KernelLight *klight,
+                                                        ccl_private LightSample *ls,
+                                                        const float3 P)
+{
+  ls->D = normalize_len(ls->P - P, &ls->t);
+  ls->Ng = -ls->D;
+
+  float2 uv = map_to_sphere(ls->Ng);
+  ls->u = uv.x;
+  ls->v = uv.y;
+
+  float invarea = klight->spot.invarea;
+  ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
+  ls->pdf = invarea;
+}
+
+ccl_device_inline bool point_light_intersect(const ccl_global KernelLight *klight,
+                                             const ccl_private Ray *ccl_restrict ray,
+                                             ccl_private float *t)
+{
+  /* Sphere light (aka, aligned disk light). */
+  const float3 lightP = klight->co;
+  const float radius = klight->spot.radius;
+  if (radius == 0.0f) {
+    return false;
+  }
+
+  /* disk oriented normal */
+  const float3 lightN = normalize(ray->P - lightP);
+  float3 P;
+  return ray_disk_intersect(ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, t);
+}
+
+ccl_device_inline bool point_light_sample_from_intersection(
+    const ccl_global KernelLight *klight,
+    ccl_private const Intersection *ccl_restrict isect,
+    const float3 ray_P,
+    const float3 ray_D,
+    ccl_private LightSample *ccl_restrict ls)
+{
+  const float3 lighN = normalize(ray_P - klight->co);
+
+  /* We set the light normal to the outgoing direction to support texturing. */
+  ls->Ng = -ls->D;
+
+  float invarea = klight->spot.invarea;
+  ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
+  ls->pdf = invarea;
+
+  if (ls->eval_fac == 0.0f) {
+    return false;
+  }
+
+  float2 uv = map_to_sphere(ls->Ng);
+  ls->u = uv.x;
+  ls->v = uv.y;
+
+  /* compute pdf */
+  if (ls->t != FLT_MAX) {
+    ls->pdf *= lamp_light_pdf(lighN, -ls->D, ls->t);
+  }
+  else {
+    ls->pdf = 0.f;
+  }
+
+  return true;
+}
+
+template<bool in_volume_segment>
+ccl_device_forceinline bool point_light_tree_parameters(const ccl_global KernelLight *klight,
+                                                        const float3 centroid,
+                                                        const float3 P,
+                                                        ccl_private float &cos_theta_u,
+                                                        ccl_private float2 &distance,
+                                                        ccl_private float3 &point_to_centroid)
+{
+  if (in_volume_segment) {
+    cos_theta_u = 1.0f; /* Any value in [-1, 1], irrelevant since theta = 0 */
+    return true;
+  }
+  float min_distance;
+  point_to_centroid = safe_normalize_len(centroid - P, &min_distance);
+
+  const float radius = klight->spot.radius;
+  const float hypotenus = sqrtf(sqr(radius) + sqr(min_distance));
+  cos_theta_u = min_distance / hypotenus;
+
+  distance = make_float2(hypotenus, min_distance);
+
+  return true;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -6,8 +6,13 @@
 #include "kernel/integrator/path_state.h"
 #include "kernel/integrator/surface_shader.h"

+#include "kernel/light/distribution.h"
 #include "kernel/light/light.h"

+#ifdef __LIGHT_TREE__
+#  include "kernel/light/tree.h"
+#endif
+
 #include "kernel/sample/mapping.h"
 #include "kernel/sample/mis.h"

@@ -277,6 +282,8 @@ ccl_device_inline void light_sample_to_volume_shadow_ray(
  shadow_ray_setup(sd, ls, P, ray, false);
 }

+/* Multiple importance sampling weights. */
+
 ccl_device_inline float light_sample_mis_weight_forward(KernelGlobals kg,
                                                        const float forward_pdf,
                                                        const float nee_pdf)
@@ -309,4 +316,333 @@ ccl_device_inline float light_sample_mis_weight_nee(KernelGlobals kg,
    return power_heuristic(nee_pdf, forward_pdf);
 }

+/* Next event estimation sampling.
+ *
+ * Sample a position on a light in the scene, from a position on a surface or
+ * from a volume segment.
+ *
+ * Uses either a flat distribution or light tree. */
+
+ccl_device_inline bool light_sample_from_volume_segment(KernelGlobals kg,
+                                                        float randu,
+                                                        float randv,
+                                                        const float time,
+                                                        const float3 P,
+                                                        const float3 D,
+                                                        const float t,
+                                                        const int bounce,
+                                                        const uint32_t path_flag,
+                                                        ccl_private LightSample *ls)
+{
+  /* Select an emitter. */
+  int emitter_object = 0;
+  int emitter_prim = 0;
+  int emitter_shader_flag = 0;
+  float emitter_pdf_selection = 0.0f;
+
+#ifdef __LIGHT_TREE__
+  if (kernel_data.integrator.use_light_tree) {
+    if (!light_tree_sample<true>(kg,
+                                 randu,
+                                 randv,
+                                 time,
+                                 P,
+                                 D,
+                                 t,
+                                 SD_BSDF_HAS_TRANSMISSION,
+                                 bounce,
+                                 path_flag,
+                                 emitter_object,
+                                 emitter_prim,
+                                 emitter_shader_flag,
+                                 emitter_pdf_selection)) {
+      return false;
+    }
+  }
+  else
+#endif
+  {
+    if (!light_distribution_sample(kg,
+                                   randu,
+                                   randv,
+                                   time,
+                                   P,
+                                   bounce,
+                                   path_flag,
+                                   emitter_object,
+                                   emitter_prim,
+                                   emitter_shader_flag,
+                                   emitter_pdf_selection)) {
+      return false;
+    }
+  }
+
+  /* Set first, triangle light sampling from flat distribution will override. */
+  ls->pdf_selection = emitter_pdf_selection;
+
+  /* Sample a point on the chosen emitter. */
+  if (emitter_prim >= 0) {
+    /* Mesh light. */
+    /* Exclude synthetic meshes from shadow catcher pass. */
+    if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
+        !(kernel_data_fetch(object_flag, emitter_object) & SD_OBJECT_SHADOW_CATCHER)) {
+      return false;
+    }
+
+    if (!triangle_light_sample<true>(
+            kg, emitter_prim, emitter_object, randu, randv, time, ls, P)) {
+      return false;
+    }
+  }
+  else {
+    /* Light object. */
+    const int lamp = ~emitter_prim;
+
+    if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
+      return false;
+    }
+
+    if (!light_sample<true>(kg, lamp, randu, randv, P, path_flag, ls)) {
+      return false;
+    }
+  }
+
+  ls->pdf *= ls->pdf_selection;
+  ls->shader |= emitter_shader_flag;
+
+  return (ls->pdf > 0);
+}
+
+ccl_device bool light_sample_from_position(KernelGlobals kg,
+                                           ccl_private const RNGState *rng_state,
+                                           float randu,
+                                           float randv,
+                                           const float time,
+                                           const float3 P,
+                                           const float3 N,
+                                           const int shader_flags,
+                                           const int bounce,
+                                           const uint32_t path_flag,
+                                           ccl_private LightSample *ls)
+{
+  /* Select an emitter. */
+  int emitter_object = 0;
+  int emitter_prim = 0;
+  int emitter_shader_flag = 0;
+  float emitter_pdf_selection = 0.0f;
+
+#ifdef __LIGHT_TREE__
+  if (kernel_data.integrator.use_light_tree) {
+    if (!light_tree_sample<false>(kg,
+                                  randu,
+                                  randv,
+                                  time,
+                                  P,
+                                  N,
+                                  0,
+                                  shader_flags,
+                                  bounce,
+                                  path_flag,
+                                  emitter_object,
+                                  emitter_prim,
+                                  emitter_shader_flag,
+                                  emitter_pdf_selection)) {
+      return false;
+    }
+  }
+  else
+#endif
+  {
+    if (!light_distribution_sample(kg,
+                                   randu,
+                                   randv,
+                                   time,
+                                   P,
+                                   bounce,
+                                   path_flag,
+                                   emitter_object,
+                                   emitter_prim,
+                                   emitter_shader_flag,
+                                   emitter_pdf_selection)) {
+      return false;
+    }
+  }
+
+  /* Set first, triangle light sampling from flat distribution will override. */
+  ls->pdf_selection = emitter_pdf_selection;
+
+  /* Sample a point on the chosen emitter.
+   * TODO: deduplicate code with light_sample_from_volume_segment? */
+  if (emitter_prim >= 0) {
+    /* Mesh light. */
+    /* Exclude synthetic meshes from shadow catcher pass. */
+    if ((path_flag & PATH_RAY_SHADOW_CATCHER_PASS) &&
+        !(kernel_data_fetch(object_flag, emitter_object) & SD_OBJECT_SHADOW_CATCHER)) {
+      return false;
+    }
+
+    if (!triangle_light_sample<false>(
+            kg, emitter_prim, emitter_object, randu, randv, time, ls, P)) {
+      return false;
+    }
+  }
+  else {
+    /* Light object. */
+    const int lamp = ~emitter_prim;
+
+    if (UNLIKELY(light_select_reached_max_bounces(kg, lamp, bounce))) {
+      return false;
+    }
+
+    if (!light_sample<false>(kg, lamp, randu, randv, P, path_flag, ls)) {
+      return false;
+    }
+  }
+
+  ls->pdf *= ls->pdf_selection;
+  ls->shader |= emitter_shader_flag;
+
+  return (ls->pdf > 0);
+}
+
+ccl_device_inline bool light_sample_new_position(KernelGlobals kg,
+                                                 const float randu,
+                                                 const float randv,
+                                                 const float time,
+                                                 const float3 P,
+                                                 ccl_private LightSample *ls)
+{
+  /* Sample a new position on the same light, for volume sampling. */
+  if (ls->type == LIGHT_TRIANGLE) {
+    if (!triangle_light_sample<false>(kg, ls->prim, ls->object, randu, randv, time, ls, P)) {
+      return false;
+    }
+
+#ifdef __LIGHT_TREE__
+    if (kernel_data.integrator.use_light_tree) {
+      ls->pdf *= ls->pdf_selection;
+    }
+    else
+#endif
+    {
+      /* Handled in triangle_light_sample for efficiency. */
+    }
+    return true;
+  }
+  else {
+    if (!light_sample<false>(kg, ls->lamp, randu, randv, P, 0, ls)) {
+      return false;
+    }
+    ls->pdf *= ls->pdf_selection;
+    return true;
+  }
+}
+
+ccl_device_forceinline void light_sample_update_position(KernelGlobals kg,
+                                                         ccl_private LightSample *ls,
+                                                         const float3 P)
+{
+  /* Update light sample for new shading point position, while keeping
+   * position on the light fixed. */
+
+  /* NOTE : preserve pdf in area measure. */
+  light_update_position(kg, ls, P);
+
+  /* Re-apply already computed selection pdf. */
+  ls->pdf *= ls->pdf_selection;
+}
+
+/* Forward sampling.
+ *
+ * Multiple importance sampling weights for hitting surface, light or background
+ * through indirect light ray.
+ *
+ * The BSDF or phase pdf from the previous bounce was stored in mis_ray_pdf and
+ * is used for balancing with the light sampling pdf. */
+
+ccl_device_inline float light_sample_mis_weight_forward_surface(KernelGlobals kg,
+                                                                IntegratorState state,
+                                                                const uint32_t path_flag,
+                                                                const ccl_private ShaderData *sd)
+{
+  const float bsdf_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
+  const float t = sd->ray_length;
+  float pdf = triangle_light_pdf(kg, sd, t);
+
+  /* Light selection pdf. */
+#ifdef __LIGHT_TREE__
+  if (kernel_data.integrator.use_light_tree) {
+    float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+    const float3 N = INTEGRATOR_STATE(state, path, mis_origin_n);
+    uint lookup_offset = kernel_data_fetch(object_lookup_offset, sd->object);
+    uint prim_offset = kernel_data_fetch(object_prim_offset, sd->object);
+    pdf *= light_tree_pdf(kg, ray_P, N, path_flag, sd->prim - prim_offset + lookup_offset);
+  }
+  else
+#endif
+  {
+    /* Handled in triangle_light_pdf for efficiency. */
+  }
+
+  return light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
+}
+
+ccl_device_inline float light_sample_mis_weight_forward_lamp(KernelGlobals kg,
+                                                             IntegratorState state,
+                                                             const uint32_t path_flag,
+                                                             const ccl_private LightSample *ls,
+                                                             const float3 P)
+{
+  const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
+  float pdf = ls->pdf;
+
+  /* Light selection pdf. */
+#ifdef __LIGHT_TREE__
+  if (kernel_data.integrator.use_light_tree) {
+    const float3 N = INTEGRATOR_STATE(state, path, mis_origin_n);
+    pdf *= light_tree_pdf(kg, P, N, path_flag, ~ls->lamp);
+  }
+  else
+#endif
+  {
+    pdf *= light_distribution_pdf_lamp(kg);
+  }
+
+  return light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
+}
+
+ccl_device_inline float light_sample_mis_weight_forward_distant(KernelGlobals kg,
+                                                                IntegratorState state,
+                                                                const uint32_t path_flag,
+                                                                const ccl_private LightSample *ls)
+{
+  const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+  return light_sample_mis_weight_forward_lamp(kg, state, path_flag, ls, ray_P);
+}
+
+ccl_device_inline float light_sample_mis_weight_forward_background(KernelGlobals kg,
+                                                                   IntegratorState state,
+                                                                   const uint32_t path_flag)
+{
+  const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
+  const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
+  const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
+
+  float pdf = background_light_pdf(kg, ray_P, ray_D);
+
+  /* Light selection pdf. */
+#ifdef __LIGHT_TREE__
+  if (kernel_data.integrator.use_light_tree) {
+    const float3 N = INTEGRATOR_STATE(state, path, mis_origin_n);
+    pdf *= light_tree_pdf(kg, ray_P, N, path_flag, ~kernel_data.background.light_index);
+  }
+  else
+#endif
+  {
+    pdf *= light_distribution_pdf_lamp(kg);
+  }
+
+  return light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/spot.h
+++ b/intern/cycles/kernel/light/spot.h
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/light/common.h"
+
+CCL_NAMESPACE_BEGIN
+
+ccl_device float spot_light_attenuation(float3 dir,
+                                        float cos_half_spot_angle,
+                                        float spot_smooth,
+                                        float3 N)
+{
+  float attenuation = dot(dir, N);
+
+  if (attenuation <= cos_half_spot_angle) {
+    attenuation = 0.0f;
+  }
+  else {
+    float t = attenuation - cos_half_spot_angle;
+
+    if (t < spot_smooth && spot_smooth != 0.0f)
+      attenuation *= smoothstepf(t / spot_smooth);
+  }
+
+  return attenuation;
+}
+
+template<bool in_volume_segment>
+ccl_device_inline bool spot_light_sample(const ccl_global KernelLight *klight,
+                                         const float randu,
+                                         const float randv,
+                                         const float3 P,
+                                         ccl_private LightSample *ls)
+{
+  ls->P = klight->co;
+
+  const float3 center = klight->co;
+  const float radius = klight->spot.radius;
+  /* disk oriented normal */
+  const float3 lightN = normalize(P - center);
+  ls->P = center;
+
+  if (radius > 0.0f) {
+    /* disk light */
+    ls->P += disk_light_sample(lightN, randu, randv) * radius;
+  }
+
+  const float invarea = klight->spot.invarea;
+  ls->pdf = invarea;
+
+  ls->D = normalize_len(ls->P - P, &ls->t);
+  /* we set the light normal to the outgoing direction to support texturing */
+  ls->Ng = -ls->D;
+
+  ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
+
+  /* spot light attenuation */
+  ls->eval_fac *= spot_light_attenuation(
+      klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
+  if (!in_volume_segment && ls->eval_fac == 0.0f) {
+    return false;
+  }
+
+  float2 uv = map_to_sphere(ls->Ng);
+  ls->u = uv.x;
+  ls->v = uv.y;
+
+  ls->pdf *= lamp_light_pdf(lightN, -ls->D, ls->t);
+  return true;
+}
+
+ccl_device_forceinline void spot_light_update_position(const ccl_global KernelLight *klight,
+                                                       ccl_private LightSample *ls,
+                                                       const float3 P)
+{
+  ls->D = normalize_len(ls->P - P, &ls->t);
+  ls->Ng = -ls->D;
+
+  float2 uv = map_to_sphere(ls->Ng);
+  ls->u = uv.x;
+  ls->v = uv.y;
+
+  float invarea = klight->spot.invarea;
+  ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
+  ls->pdf = invarea;
+
+  /* spot light attenuation */
+  ls->eval_fac *= spot_light_attenuation(
+      klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, ls->Ng);
+}
+
+ccl_device_inline bool spot_light_intersect(const ccl_global KernelLight *klight,
+                                            const ccl_private Ray *ccl_restrict ray,
+                                            ccl_private float *t)
+{
+  /* Spot/Disk light. */
+  const float3 lightP = klight->co;
+  const float radius = klight->spot.radius;
+  if (radius == 0.0f) {
+    return false;
+  }
+  /* disk oriented normal */
+  const float3 lightN = normalize(ray->P - lightP);
+  /* One sided. */
+  if (dot(ray->D, lightN) >= 0.0f) {
+    return false;
+  }
+
+  float3 P;
+  return ray_disk_intersect(ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, t);
+}
+
+ccl_device_inline bool spot_light_sample_from_intersection(
+    const ccl_global KernelLight *klight,
+    ccl_private const Intersection *ccl_restrict isect,
+    const float3 ray_P,
+    const float3 ray_D,
+    ccl_private LightSample *ccl_restrict ls)
+{
+  /* the normal of the oriented disk */
+  const float3 lightN = normalize(ray_P - klight->co);
+  /* We set the light normal to the outgoing direction to support texturing. */
+  ls->Ng = -ls->D;
+
+  float invarea = klight->spot.invarea;
+  ls->eval_fac = (0.25f * M_1_PI_F) * invarea;
+  ls->pdf = invarea;
+
+  /* spot light attenuation */
+  ls->eval_fac *= spot_light_attenuation(
+      klight->spot.dir, klight->spot.cos_half_spot_angle, klight->spot.spot_smooth, -ls->D);
+
+  if (ls->eval_fac == 0.0f) {
+    return false;
+  }
+
+  float2 uv = map_to_sphere(ls->Ng);
+  ls->u = uv.x;
+  ls->v = uv.y;
+
+  /* compute pdf */
+  if (ls->t != FLT_MAX) {
+    ls->pdf *= lamp_light_pdf(lightN, -ls->D, ls->t);
+  }
+  else {
+    ls->pdf = 0.f;
+  }
+
+  return true;
+}
+
+template<bool in_volume_segment>
+ccl_device_forceinline bool spot_light_tree_parameters(const ccl_global KernelLight *klight,
+                                                       const float3 centroid,
+                                                       const float3 P,
+                                                       ccl_private float &cos_theta_u,
+                                                       ccl_private float2 &distance,
+                                                       ccl_private float3 &point_to_centroid)
+{
+  float min_distance;
+  const float3 point_to_centroid_ = safe_normalize_len(centroid - P, &min_distance);
+
+  const float radius = klight->spot.radius;
+  const float hypotenus = sqrtf(sqr(radius) + sqr(min_distance));
+  cos_theta_u = min_distance / hypotenus;
+
+  if (in_volume_segment) {
+    return true;
+  }
+
+  distance = make_float2(hypotenus, min_distance);
+  point_to_centroid = point_to_centroid_;
+
+  return true;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/tree.h
+++ b/intern/cycles/kernel/light/tree.h
@@ -0,0 +1,691 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/light/area.h"
+#include "kernel/light/common.h"
+#include "kernel/light/light.h"
+#include "kernel/light/spot.h"
+#include "kernel/light/triangle.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* TODO: this seems like a relative expensive computation, and we can make it a lot cheaper
+ * by using a bounding sphere instead of a bounding box. This will be more inaccurate, but it
+ * might be fine when used along with the adaptive splitting. */
+ccl_device float light_tree_cos_bounding_box_angle(const BoundingBox bbox,
+                                                   const float3 P,
+                                                   const float3 point_to_centroid)
+{
+  if (P.x > bbox.min.x && P.y > bbox.min.y && P.z > bbox.min.z && P.x < bbox.max.x &&
+      P.y < bbox.max.y && P.z < bbox.max.z) {
+    /* If P is inside the bbox, `theta_u` covers the whole sphere */
+    return -1.0f;
+  }
+  float cos_theta_u = 1.0f;
+  /* Iterate through all 8 possible points of the bounding box. */
+  for (int i = 0; i < 8; ++i) {
+    const float3 corner = make_float3((i & 1) ? bbox.max.x : bbox.min.x,
+                                      (i & 2) ? bbox.max.y : bbox.min.y,
+                                      (i & 4) ? bbox.max.z : bbox.min.z);
+
+    /* Caculate the bounding box angle. */
+    float3 point_to_corner = normalize(corner - P);
+    cos_theta_u = fminf(cos_theta_u, dot(point_to_centroid, point_to_corner));
+  }
+  return cos_theta_u;
+}
+
+ccl_device_forceinline float sin_from_cos(const float c)
+{
+  return safe_sqrtf(1.0f - sqr(c));
+}
+
+/* Compute vector v as in Fig .8. P_v is the corresponding point along the ray ccl_device float3 */
+ccl_device float3 compute_v(
+    const float3 centroid, const float3 P, const float3 D, const float3 bcone_axis, const float t)
+{
+  const float3 unnormalized_v0 = P - centroid;
+  float len_v0;
+  const float3 unnormalized_v1 = unnormalized_v0 + D * fminf(t, 1e12f);
+  const float3 v0 = normalize_len(unnormalized_v0, &len_v0);
+  const float3 v1 = normalize(unnormalized_v1);
+
+  const float3 o0 = v0;
+  float3 o1, o2;
+  make_orthonormals_tangent(o0, v1, &o1, &o2);
+
+  const float dot_o0_a = dot(o0, bcone_axis);
+  const float dot_o1_a = dot(o1, bcone_axis);
+  const float cos_phi0 = dot_o0_a / sqrtf(sqr(dot_o0_a) + sqr(dot_o1_a));
+
+  return (dot_o1_a < 0 || dot(v0, v1) > cos_phi0) ? (dot_o0_a > dot(v1, bcone_axis) ? v0 : v1) :
+                                                    cos_phi0 * o0 + sin_from_cos(cos_phi0) * o1;
+}
+
+/* This is the general function for calculating the importance of either a cluster or an emitter.
+ * Both of the specialized functions obtain the necessary data before calling this function. */
+template<bool in_volume_segment>
+ccl_device void light_tree_importance(const float3 N_or_D,
+                                      const bool has_transmission,
+                                      const float3 point_to_centroid,
+                                      const float cos_theta_u,
+                                      const BoundingCone bcone,
+                                      const float max_distance,
+                                      const float min_distance,
+                                      const float t,
+                                      const float energy,
+                                      ccl_private float &max_importance,
+                                      ccl_private float &min_importance)
+{
+  max_importance = 0.0f;
+  min_importance = 0.0f;
+
+  const float sin_theta_u = sin_from_cos(cos_theta_u);
+
+  /* cos(theta_i') in the paper, omitted for volume */
+  float cos_min_incidence_angle = 1.0f;
+  float cos_max_incidence_angle = 1.0f;
+
+  /* when sampling the light tree for the second time in `shade_volume.h` and when query the pdf in
+   * `sample.h` */
+  const bool in_volume = is_zero(N_or_D);
+  if (!in_volume_segment && !in_volume) {
+    const float3 N = N_or_D;
+    const float cos_theta_i = has_transmission ? fabsf(dot(point_to_centroid, N)) :
+                                                 dot(point_to_centroid, N);
+    const float sin_theta_i = sin_from_cos(cos_theta_i);
+
+    /* cos_min_incidence_angle = cos(max{theta_i - theta_u, 0}) = cos(theta_i') in the paper */
+    cos_min_incidence_angle = cos_theta_i >= cos_theta_u ?
+                                  1.0f :
+                                  cos_theta_i * cos_theta_u + sin_theta_i * sin_theta_u;
+
+    /* If the node is guaranteed to be behind the surface we're sampling, and the surface is
+     * opaque, then we can give the node an importance of 0 as it contributes nothing to the
+     * surface. This is more accurate than the bbox test if we are calculating the importance of
+     * an emitter with radius */
+    if (!has_transmission && cos_min_incidence_angle < 0) {
+      return;
+    }
+
+    /* cos_max_incidence_angle = cos(min{theta_i + theta_u, pi}) */
+    cos_max_incidence_angle = fmaxf(cos_theta_i * cos_theta_u - sin_theta_i * sin_theta_u, 0.0f);
+  }
+
+  /* cos(theta - theta_u) */
+  const float cos_theta = dot(bcone.axis, -point_to_centroid);
+  const float sin_theta = sin_from_cos(cos_theta);
+  const float cos_theta_minus_theta_u = cos_theta * cos_theta_u + sin_theta * sin_theta_u;
+
+  float cos_theta_o, sin_theta_o;
+  fast_sincosf(bcone.theta_o, &sin_theta_o, &cos_theta_o);
+
+  /* minimum angle an emitter’s axis would form with the direction to the shading point,
+   * cos(theta') in the paper */
+  float cos_min_outgoing_angle;
+  if ((cos_theta >= cos_theta_u) || (cos_theta_minus_theta_u >= cos_theta_o)) {
+    /* theta - theta_o - theta_u <= 0 */
+    kernel_assert((fast_acosf(cos_theta) - bcone.theta_o - fast_acosf(cos_theta_u)) < 5e-4f);
+    cos_min_outgoing_angle = 1.0f;
+  }
+  else if ((bcone.theta_o + bcone.theta_e > M_PI_F) ||
+           (cos_theta_minus_theta_u > cos(bcone.theta_o + bcone.theta_e))) {
+    /* theta' = theta - theta_o - theta_u < theta_e */
+    kernel_assert(
+        (fast_acosf(cos_theta) - bcone.theta_o - fast_acosf(cos_theta_u) - bcone.theta_e) < 5e-4f);
+    const float sin_theta_minus_theta_u = sin_from_cos(cos_theta_minus_theta_u);
+    cos_min_outgoing_angle = cos_theta_minus_theta_u * cos_theta_o +
+                             sin_theta_minus_theta_u * sin_theta_o;
+  }
+  else {
+    /* cluster invisible */
+    return;
+  }
+
+  /* TODO: find a good approximation for f_a. */
+  const float f_a = 1.0f;
+  /* TODO: also consider t (or theta_a, theta_b) for volume */
+  max_importance = fabsf(f_a * cos_min_incidence_angle * energy * cos_min_outgoing_angle /
+                         (in_volume_segment ? min_distance : sqr(min_distance)));
+
+  /* TODO: also min importance for volume? */
+  if (in_volume_segment) {
+    min_importance = max_importance;
+    return;
+  }
+
+  /* cos(theta + theta_o + theta_u) if theta + theta_o + theta_u < theta_e, 0 otherwise */
+  float cos_max_outgoing_angle;
+  const float cos_theta_plus_theta_u = cos_theta * cos_theta_u - sin_theta * sin_theta_u;
+  if (bcone.theta_e - bcone.theta_o < 0 || cos_theta < 0 || cos_theta_u < 0 ||
+      cos_theta_plus_theta_u < cos(bcone.theta_e - bcone.theta_o)) {
+    min_importance = 0.0f;
+  }
+  else {
+    const float sin_theta_plus_theta_u = sin_from_cos(cos_theta_plus_theta_u);
+    cos_max_outgoing_angle = cos_theta_plus_theta_u * cos_theta_o -
+                             sin_theta_plus_theta_u * sin_theta_o;
+    min_importance = fabsf(f_a * cos_max_incidence_angle * energy * cos_max_outgoing_angle /
+                           sqr(max_distance));
+  }
+}
+
+template<bool in_volume_segment>
+ccl_device bool compute_emitter_centroid_and_dir(KernelGlobals kg,
+                                                 ccl_global const KernelLightTreeEmitter *kemitter,
+                                                 const float3 P,
+                                                 ccl_private float3 &centroid,
+                                                 ccl_private packed_float3 &dir)
+{
+  const int prim_id = kemitter->prim_id;
+  if (prim_id < 0) {
+    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ~prim_id);
+    centroid = klight->co;
+
+    switch (klight->type) {
+      case LIGHT_SPOT:
+        dir = klight->spot.dir;
+        break;
+      case LIGHT_POINT:
+        /* Disk-oriented normal */
+        dir = safe_normalize(P - centroid);
+        break;
+      case LIGHT_AREA:
+        dir = klight->area.dir;
+        break;
+      case LIGHT_BACKGROUND:
+        /* Aarbitrary centroid and direction */
+        centroid = make_float3(0.0f, 0.0f, 1.0f);
+        dir = make_float3(0.0f, 0.0f, -1.0f);
+        return !in_volume_segment;
+      case LIGHT_DISTANT:
+        dir = centroid;
+        return !in_volume_segment;
+      default:
+        return false;
+    }
+  }
+  else {
+    const int object = kemitter->mesh_light.object_id;
+    float3 vertices[3];
+    triangle_world_space_vertices(kg, object, prim_id, -1.0f, vertices);
+    centroid = (vertices[0] + vertices[1] + vertices[2]) / 3.0f;
+
+    if (kemitter->mesh_light.emission_sampling == EMISSION_SAMPLING_FRONT) {
+      dir = safe_normalize(cross(vertices[1] - vertices[0], vertices[2] - vertices[0]));
+    }
+    else if (kemitter->mesh_light.emission_sampling == EMISSION_SAMPLING_BACK) {
+      dir = -safe_normalize(cross(vertices[1] - vertices[0], vertices[2] - vertices[0]));
+    }
+    else {
+      /* Double sided: any vector in the plane. */
+      dir = safe_normalize(vertices[0] - vertices[1]);
+    }
+  }
+  return true;
+}
+
+template<bool in_volume_segment>
+ccl_device void light_tree_emitter_importance(KernelGlobals kg,
+                                              const float3 P,
+                                              const float3 N_or_D,
+                                              const float t,
+                                              const bool has_transmission,
+                                              int emitter_index,
+                                              ccl_private float &max_importance,
+                                              ccl_private float &min_importance)
+{
+  const ccl_global KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
+                                                                         emitter_index);
+
+  max_importance = 0.0f;
+  min_importance = 0.0f;
+  BoundingCone bcone;
+  bcone.theta_o = kemitter->theta_o;
+  bcone.theta_e = kemitter->theta_e;
+  float cos_theta_u;
+  float2 distance; /* distance.x = max_distance, distance.y = mix_distance */
+  float3 centroid, point_to_centroid, P_c;
+
+  if (!compute_emitter_centroid_and_dir<in_volume_segment>(
+          kg, kemitter, P, centroid, bcone.axis)) {
+    return;
+  }
+
+  const int prim_id = kemitter->prim_id;
+
+  if (in_volume_segment) {
+    const float3 D = N_or_D;
+    /* Closest point */
+    P_c = P + dot(centroid - P, D) * D;
+    /* minimal distance of the ray to the cluster */
+    distance.x = len(centroid - P_c);
+    distance.y = distance.x;
+    point_to_centroid = -compute_v(centroid, P, D, bcone.axis, t);
+  }
+  else {
+    P_c = P;
+  }
+
+  bool is_visible;
+  if (prim_id < 0) {
+    const ccl_global KernelLight *klight = &kernel_data_fetch(lights, ~prim_id);
+    switch (klight->type) {
+      /* Function templates only modifies cos_theta_u when in_volume_segment = true */
+      case LIGHT_SPOT:
+        is_visible = spot_light_tree_parameters<in_volume_segment>(
+            klight, centroid, P_c, cos_theta_u, distance, point_to_centroid);
+        break;
+      case LIGHT_POINT:
+        is_visible = point_light_tree_parameters<in_volume_segment>(
+            klight, centroid, P_c, cos_theta_u, distance, point_to_centroid);
+        bcone.theta_o = 0.0f;
+        break;
+      case LIGHT_AREA:
+        is_visible = area_light_tree_parameters<in_volume_segment>(
+            klight, centroid, P_c, N_or_D, bcone.axis, cos_theta_u, distance, point_to_centroid);
+        break;
+      case LIGHT_BACKGROUND:
+        is_visible = background_light_tree_parameters(
+            centroid, cos_theta_u, distance, point_to_centroid);
+        break;
+      case LIGHT_DISTANT:
+        is_visible = distant_light_tree_parameters(
+            centroid, bcone.theta_e, cos_theta_u, distance, point_to_centroid);
+        break;
+      default:
+        return;
+    }
+  }
+  else { /* mesh light */
+    is_visible = triangle_light_tree_parameters<in_volume_segment>(
+        kg, kemitter, centroid, P_c, N_or_D, bcone, cos_theta_u, distance, point_to_centroid);
+  }
+
+  is_visible |= has_transmission;
+  if (!is_visible) {
+    return;
+  }
+
+  light_tree_importance<in_volume_segment>(N_or_D,
+                                           has_transmission,
+                                           point_to_centroid,
+                                           cos_theta_u,
+                                           bcone,
+                                           distance.x,
+                                           distance.y,
+                                           t,
+                                           kemitter->energy,
+                                           max_importance,
+                                           min_importance);
+}
+
+template<bool in_volume_segment>
+ccl_device void light_tree_node_importance(KernelGlobals kg,
+                                           const float3 P,
+                                           const float3 N_or_D,
+                                           const float t,
+                                           const bool has_transmission,
+                                           const ccl_global KernelLightTreeNode *knode,
+                                           ccl_private float &max_importance,
+                                           ccl_private float &min_importance)
+{
+  max_importance = 0.0f;
+  min_importance = 0.0f;
+  if (knode->num_prims == 1) {
+    /* At a leaf node with only one emitter */
+    light_tree_emitter_importance<in_volume_segment>(
+        kg, P, N_or_D, t, has_transmission, -knode->child_index, max_importance, min_importance);
+  }
+  else if (knode->num_prims != 0) {
+    const BoundingCone bcone = knode->bcone;
+    const BoundingBox bbox = knode->bbox;
+
+    float3 point_to_centroid;
+    float cos_theta_u;
+    float distance;
+    if (knode->bit_trail == 1) {
+      /* distant light node */
+      if (in_volume_segment) {
+        return;
+      }
+      point_to_centroid = -bcone.axis;
+      cos_theta_u = fast_cosf(bcone.theta_o);
+      distance = 1.0f;
+    }
+    else {
+      const float3 centroid = 0.5f * (bbox.min + bbox.max);
+
+      if (in_volume_segment) {
+        const float3 D = N_or_D;
+        const float3 closest_point = P + dot(centroid - P, D) * D;
+        /* minimal distance of the ray to the cluster */
+        distance = len(centroid - closest_point);
+        point_to_centroid = -compute_v(centroid, P, D, bcone.axis, t);
+        cos_theta_u = light_tree_cos_bounding_box_angle(bbox, closest_point, point_to_centroid);
+      }
+      else {
+        const float3 N = N_or_D;
+        const float3 bbox_extent = bbox.max - centroid;
+        const bool bbox_is_visible = has_transmission |
+                                     (dot(N, centroid - P) + dot(fabs(N), fabs(bbox_extent)) > 0);
+
+        /* If the node is guaranteed to be behind the surface we're sampling, and the surface is
+         * opaque, then we can give the node an importance of 0 as it contributes nothing to the
+         * surface. */
+        if (!bbox_is_visible) {
+          return;
+        }
+
+        point_to_centroid = normalize_len(centroid - P, &distance);
+        cos_theta_u = light_tree_cos_bounding_box_angle(bbox, P, point_to_centroid);
+      }
+      /* clamp distance to half the radius of the cluster when splitting is disabled */
+      distance = fmaxf(0.5f * len(centroid - bbox.max), distance);
+    }
+    /* TODO: currently max_distance = min_distance, max_importance = min_importance for the
+     * nodes. Do we need better weights for complex scenes? */
+    light_tree_importance<in_volume_segment>(N_or_D,
+                                             has_transmission,
+                                             point_to_centroid,
+                                             cos_theta_u,
+                                             bcone,
+                                             distance,
+                                             distance,
+                                             t,
+                                             knode->energy,
+                                             max_importance,
+                                             min_importance);
+  }
+}
+
+ccl_device void sample_resevoir(const int current_index,
+                                const float current_weight,
+                                ccl_private int &selected_index,
+                                ccl_private float &selected_weight,
+                                ccl_private float &total_weight,
+                                ccl_private float &rand)
+{
+  if (current_weight == 0.0f) {
+    return;
+  }
+  total_weight += current_weight;
+  float thresh = current_weight / total_weight;
+  if (rand <= thresh) {
+    selected_index = current_index;
+    selected_weight = current_weight;
+    rand = rand / thresh;
+  }
+  else {
+    rand = (rand - thresh) / (1.0f - thresh);
+  }
+  kernel_assert(rand >= 0.0f && rand <= 1.0f);
+  return;
+}
+
+/* pick an emitter from a leaf node using resevoir sampling, keep two reservoirs for upper and
+ * lower bounds */
+template<bool in_volume_segment>
+ccl_device int light_tree_cluster_select_emitter(KernelGlobals kg,
+                                                 ccl_private float &rand,
+                                                 const float3 P,
+                                                 const float3 N_or_D,
+                                                 const float t,
+                                                 const bool has_transmission,
+                                                 const ccl_global KernelLightTreeNode *knode,
+                                                 ccl_private float *pdf_factor)
+{
+  float selected_importance[2] = {0.0f, 0.0f};
+  float total_importance[2] = {0.0f, 0.0f};
+  int selected_index = -1;
+
+  /* Mark emitters with zero importance. Used for resevoir when total minimum importance = 0 */
+  kernel_assert(knode->num_prims <= sizeof(uint) * 8);
+  uint has_importance = 0;
+
+  const bool sample_max = (rand > 0.5f); /* sampling using the maximum importance */
+  rand = rand * 2.0f - float(sample_max);
+
+  for (int i = 0; i < knode->num_prims; i++) {
+    int current_index = -knode->child_index + i;
+    /* maximum importance = importance[0], mininum importance = importance[1] */
+    float importance[2];
+    light_tree_emitter_importance<in_volume_segment>(
+        kg, P, N_or_D, t, has_transmission, current_index, importance[0], importance[1]);
+
+    sample_resevoir(current_index,
+                    importance[!sample_max],
+                    selected_index,
+                    selected_importance[!sample_max],
+                    total_importance[!sample_max],
+                    rand);
+    if (selected_index == current_index) {
+      selected_importance[sample_max] = importance[sample_max];
+    }
+    total_importance[sample_max] += importance[sample_max];
+
+    has_importance |= ((importance[0] > 0) << i);
+  }
+
+  if (total_importance[0] == 0.0f) {
+    return -1;
+  }
+
+  if (total_importance[1] == 0.0f) {
+    /* uniformly sample emitters with positive maximum importance */
+    if (sample_max) {
+      selected_importance[1] = 1.0f;
+      total_importance[1] = float(popcount(has_importance));
+    }
+    else {
+      selected_index = -1;
+      for (int i = 0; i < knode->num_prims; i++) {
+        int current_index = -knode->child_index + i;
+        sample_resevoir(current_index,
+                        float(has_importance & 1),
+                        selected_index,
+                        selected_importance[1],
+                        total_importance[1],
+                        rand);
+        has_importance >>= 1;
+      }
+
+      float discard;
+      light_tree_emitter_importance<in_volume_segment>(
+          kg, P, N_or_D, t, has_transmission, selected_index, selected_importance[0], discard);
+    }
+  }
+
+  *pdf_factor = 0.5f * (selected_importance[0] / total_importance[0] +
+                        selected_importance[1] / total_importance[1]);
+
+  return selected_index;
+}
+
+template<bool in_volume_segment>
+ccl_device bool get_left_probability(KernelGlobals kg,
+                                     const float3 P,
+                                     const float3 N_or_D,
+                                     const float t,
+                                     const bool has_transmission,
+                                     const int left_index,
+                                     const int right_index,
+                                     ccl_private float &left_probability)
+{
+  const ccl_global KernelLightTreeNode *left = &kernel_data_fetch(light_tree_nodes, left_index);
+  const ccl_global KernelLightTreeNode *right = &kernel_data_fetch(light_tree_nodes, right_index);
+
+  float min_left_importance, max_left_importance, min_right_importance, max_right_importance;
+  light_tree_node_importance<in_volume_segment>(
+      kg, P, N_or_D, t, has_transmission, left, max_left_importance, min_left_importance);
+  light_tree_node_importance<in_volume_segment>(
+      kg, P, N_or_D, t, has_transmission, right, max_right_importance, min_right_importance);
+
+  const float total_max_importance = max_left_importance + max_right_importance;
+  if (total_max_importance == 0.0f) {
+    return false;
+  }
+  const float total_min_importance = min_left_importance + min_right_importance;
+
+  /* average two probabilities of picking the left child node using lower and upper bounds */
+  const float probability_max = max_left_importance / total_max_importance;
+  const float probability_min = total_min_importance > 0 ?
+                                    min_left_importance / total_min_importance :
+                                    0.5f * (float(max_left_importance > 0) +
+                                            float(max_right_importance == 0.0f));
+  left_probability = 0.5f * (probability_max + probability_min);
+  return true;
+}
+
+template<bool in_volume_segment>
+ccl_device_noinline bool light_tree_sample(KernelGlobals kg,
+                                           ccl_private float &randu,
+                                           ccl_private float &randv,
+                                           const float time,
+                                           const float3 P,
+                                           const float3 N_or_D,
+                                           const float t,
+                                           const int shader_flags,
+                                           const int bounce,
+                                           const uint32_t path_flag,
+                                           ccl_private int &emitter_object,
+                                           ccl_private int &emitter_prim,
+                                           ccl_private int &emitter_shader_flag,
+                                           ccl_private float &emitter_pdf_selection)
+{
+  if (!kernel_data.integrator.use_direct_light) {
+    return false;
+  }
+
+  const bool has_transmission = (shader_flags & SD_BSDF_HAS_TRANSMISSION);
+  float pdf_leaf = 1.0f;
+  float pdf_emitter_from_leaf = 1.0f;
+  int selected_light = -1;
+
+  int node_index = 0; /* root node */
+
+  /* Traverse the light tree until a leaf node is reached. */
+  while (true) {
+    const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes, node_index);
+
+    if (knode->child_index <= 0) {
+      /* At a leaf node, we pick an emitter */
+      selected_light = light_tree_cluster_select_emitter<in_volume_segment>(
+          kg, randv, P, N_or_D, t, has_transmission, knode, &pdf_emitter_from_leaf);
+      break;
+    }
+
+    /* At an interior node, the left child is directly after the parent,
+     * while the right child is stored as the child index. */
+    const int left_index = node_index + 1;
+    const int right_index = knode->child_index;
+
+    float left_prob;
+    if (!get_left_probability<in_volume_segment>(
+            kg, P, N_or_D, t, has_transmission, left_index, right_index, left_prob)) {
+      return false; /* both child nodes have zero importance */
+    }
+
+    float discard;
+    float total_prob = left_prob;
+    node_index = left_index;
+    sample_resevoir(right_index, 1.0f - left_prob, node_index, discard, total_prob, randu);
+    pdf_leaf *= (node_index == left_index) ? left_prob : (1.0f - left_prob);
+  }
+
+  if (selected_light < 0) {
+    return false;
+  }
+
+  /* Return info about chosen emitter. */
+  ccl_global const KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
+                                                                         selected_light);
+
+  emitter_object = kemitter->mesh_light.object_id;
+  emitter_prim = kemitter->prim_id;
+  emitter_shader_flag = kemitter->mesh_light.shader_flag;
+  emitter_pdf_selection = pdf_leaf * pdf_emitter_from_leaf;
+
+  return true;
+}
+
+/* We need to be able to find the probability of selecting a given light for MIS. */
+ccl_device float light_tree_pdf(
+    KernelGlobals kg, const float3 P, const float3 N, const int path_flag, const int prim)
+{
+  const bool has_transmission = (path_flag & PATH_RAY_MIS_HAD_TRANSMISSION);
+  /* Target emitter info */
+  const int target_emitter = (prim >= 0) ? kernel_data_fetch(triangle_to_tree, prim) :
+                                           kernel_data_fetch(light_to_tree, ~prim);
+  ccl_global const KernelLightTreeEmitter *kemitter = &kernel_data_fetch(light_tree_emitters,
+                                                                         target_emitter);
+  const int target_leaf = kemitter->parent_index;
+  ccl_global const KernelLightTreeNode *kleaf = &kernel_data_fetch(light_tree_nodes, target_leaf);
+  uint bit_trail = kleaf->bit_trail;
+
+  int node_index = 0; /* root node */
+
+  float pdf = 1.0f;
+
+  /* Traverse the light tree until we reach the target leaf node */
+  while (true) {
+    const ccl_global KernelLightTreeNode *knode = &kernel_data_fetch(light_tree_nodes, node_index);
+
+    if (knode->child_index <= 0) {
+      break;
+    }
+
+    /* Interior node */
+    const int left_index = node_index + 1;
+    const int right_index = knode->child_index;
+
+    float left_prob;
+    if (!get_left_probability<false>(
+            kg, P, N, 0, has_transmission, left_index, right_index, left_prob)) {
+      return 0.0f;
+    }
+
+    const bool go_left = (bit_trail & 1) == 0;
+    bit_trail >>= 1;
+    pdf *= go_left ? left_prob : (1.0f - left_prob);
+    node_index = go_left ? left_index : right_index;
+
+    if (pdf == 0) {
+      return 0.0f;
+    }
+  }
+
+  kernel_assert(node_index == target_leaf);
+
+  /* Iterate through leaf node to find the probability of sampling the target emitter. */
+  float target_max_importance = 0.0f;
+  float target_min_importance = 0.0f;
+  float total_max_importance = 0.0f;
+  float total_min_importance = 0.0f;
+  int num_has_importance = 0;
+  for (int i = 0; i < kleaf->num_prims; i++) {
+    const int emitter = -kleaf->child_index + i;
+    float max_importance, min_importance;
+    light_tree_emitter_importance<false>(
+        kg, P, N, 0, has_transmission, emitter, max_importance, min_importance);
+    num_has_importance += (max_importance > 0);
+    if (emitter == target_emitter) {
+      target_max_importance = max_importance;
+      target_min_importance = min_importance;
+    }
+    total_max_importance += max_importance;
+    total_min_importance += min_importance;
+  }
+
+  if (target_max_importance > 0.0f) {
+    return pdf * 0.5f *
+           (target_max_importance / total_max_importance +
+            (total_min_importance > 0 ? target_min_importance / total_min_importance :
+                                        1.0f / num_has_importance));
+  }
+  return 0.0f;
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/light/triangle.h
+++ b/intern/cycles/kernel/light/triangle.h
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "kernel/geom/geom.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* returns true if the triangle is has motion blur or an instancing transform applied */
+ccl_device_inline bool triangle_world_space_vertices(
+    KernelGlobals kg, int object, int prim, float time, float3 V[3])
+{
+  bool has_motion = false;
+  const int object_flag = kernel_data_fetch(object_flag, object);
+
+  if (object_flag & SD_OBJECT_HAS_VERTEX_MOTION && time >= 0.0f) {
+    motion_triangle_vertices(kg, object, prim, time, V);
+    has_motion = true;
+  }
+  else {
+    triangle_vertices(kg, prim, V);
+  }
+
+  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
+#ifdef __OBJECT_MOTION__
+    float object_time = (time >= 0.0f) ? time : 0.5f;
+    Transform tfm = object_fetch_transform_motion_test(kg, object, object_time, NULL);
+#else
+    Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
+#endif
+    V[0] = transform_point(&tfm, V[0]);
+    V[1] = transform_point(&tfm, V[1]);
+    V[2] = transform_point(&tfm, V[2]);
+    has_motion = true;
+  }
+  return has_motion;
+}
+
+ccl_device_inline float triangle_light_pdf_area_sampling(const float3 Ng, const float3 I, float t)
+{
+  float cos_pi = fabsf(dot(Ng, I));
+
+  if (cos_pi == 0.0f)
+    return 0.0f;
+
+  return t * t / cos_pi;
+}
+
+ccl_device_forceinline float triangle_light_pdf(KernelGlobals kg,
+                                                ccl_private const ShaderData *sd,
+                                                float t)
+{
+  /* A naive heuristic to decide between costly solid angle sampling
+   * and simple area sampling, comparing the distance to the triangle plane
+   * to the length of the edges of the triangle. */
+
+  float3 V[3];
+  bool has_motion = triangle_world_space_vertices(kg, sd->object, sd->prim, sd->time, V);
+
+  const float3 e0 = V[1] - V[0];
+  const float3 e1 = V[2] - V[0];
+  const float3 e2 = V[2] - V[1];
+  const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+  const float3 N = cross(e0, e1);
+  const float distance_to_plane = fabsf(dot(N, sd->I * t)) / dot(N, N);
+  const float area = 0.5f * len(N);
+
+  float pdf;
+
+  if (longest_edge_squared > distance_to_plane * distance_to_plane) {
+    /* sd contains the point on the light source
+     * calculate Px, the point that we're shading */
+    const float3 Px = sd->P + sd->I * t;
+    const float3 v0_p = V[0] - Px;
+    const float3 v1_p = V[1] - Px;
+    const float3 v2_p = V[2] - Px;
+
+    const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+    const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+    const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+    const float alpha = fast_acosf(dot(u02, u01));
+    const float beta = fast_acosf(-dot(u01, u12));
+    const float gamma = fast_acosf(dot(u02, u12));
+    const float solid_angle = alpha + beta + gamma - M_PI_F;
+
+    /* distribution_pdf_triangles is calculated over triangle area, but we're not sampling over
+     * its area */
+    if (UNLIKELY(solid_angle == 0.0f)) {
+      return 0.0f;
+    }
+    else {
+      pdf = 1.0f / solid_angle;
+    }
+  }
+  else {
+    if (UNLIKELY(area == 0.0f)) {
+      return 0.0f;
+    }
+
+    pdf = triangle_light_pdf_area_sampling(sd->Ng, sd->I, t) / area;
+  }
+
+  /* Belongs in distribution.h but can reuse computations here. */
+  if (!kernel_data.integrator.use_light_tree) {
+    float distribution_area = area;
+
+    if (has_motion && area != 0.0f) {
+      /* For motion blur need area of triangle at fixed time as used in the CDF. */
+      triangle_world_space_vertices(kg, sd->object, sd->prim, -1.0f, V);
+      distribution_area = triangle_area(V[0], V[1], V[2]);
+    }
+
+    pdf *= distribution_area * kernel_data.integrator.distribution_pdf_triangles;
+  }
+
+  return pdf;
+}
+
+template<bool in_volume_segment>
+ccl_device_forceinline bool triangle_light_sample(KernelGlobals kg,
+                                                  int prim,
+                                                  int object,
+                                                  float randu,
+                                                  float randv,
+                                                  float time,
+                                                  ccl_private LightSample *ls,
+                                                  const float3 P)
+{
+  /* A naive heuristic to decide between costly solid angle sampling
+   * and simple area sampling, comparing the distance to the triangle plane
+   * to the length of the edges of the triangle. */
+
+  float3 V[3];
+  bool has_motion = triangle_world_space_vertices(kg, object, prim, time, V);
+
+  const float3 e0 = V[1] - V[0];
+  const float3 e1 = V[2] - V[0];
+  const float3 e2 = V[2] - V[1];
+  const float longest_edge_squared = max(len_squared(e0), max(len_squared(e1), len_squared(e2)));
+  const float3 N0 = cross(e0, e1);
+  float Nl = 0.0f;
+  ls->Ng = safe_normalize_len(N0, &Nl);
+  const float area = 0.5f * Nl;
+
+  /* flip normal if necessary */
+  const int object_flag = kernel_data_fetch(object_flag, object);
+  if (object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
+    ls->Ng = -ls->Ng;
+  }
+  ls->eval_fac = 1.0f;
+  ls->shader = kernel_data_fetch(tri_shader, prim);
+  ls->object = object;
+  ls->prim = prim;
+  ls->lamp = LAMP_NONE;
+  ls->shader |= SHADER_USE_MIS;
+  ls->type = LIGHT_TRIANGLE;
+  ls->group = object_lightgroup(kg, object);
+
+  float distance_to_plane = fabsf(dot(N0, V[0] - P) / dot(N0, N0));
+
+  if (!in_volume_segment && (longest_edge_squared > distance_to_plane * distance_to_plane)) {
+    /* see James Arvo, "Stratified Sampling of Spherical Triangles"
+     * http://www.graphics.cornell.edu/pubs/1995/Arv95c.pdf */
+
+    /* project the triangle to the unit sphere
+     * and calculate its edges and angles */
+    const float3 v0_p = V[0] - P;
+    const float3 v1_p = V[1] - P;
+    const float3 v2_p = V[2] - P;
+
+    const float3 u01 = safe_normalize(cross(v0_p, v1_p));
+    const float3 u02 = safe_normalize(cross(v0_p, v2_p));
+    const float3 u12 = safe_normalize(cross(v1_p, v2_p));
+
+    const float3 A = safe_normalize(v0_p);
+    const float3 B = safe_normalize(v1_p);
+    const float3 C = safe_normalize(v2_p);
+
+    const float cos_alpha = dot(u02, u01);
+    const float cos_beta = -dot(u01, u12);
+    const float cos_gamma = dot(u02, u12);
+
+    /* calculate dihedral angles */
+    const float alpha = fast_acosf(cos_alpha);
+    const float beta = fast_acosf(cos_beta);
+    const float gamma = fast_acosf(cos_gamma);
+    /* the area of the unit spherical triangle = solid angle */
+    const float solid_angle = alpha + beta + gamma - M_PI_F;
+
+    /* precompute a few things
+     * these could be re-used to take several samples
+     * as they are independent of randu/randv */
+    const float cos_c = dot(A, B);
+    const float sin_alpha = fast_sinf(alpha);
+    const float product = sin_alpha * cos_c;
+
+    /* Select a random sub-area of the spherical triangle
+     * and calculate the third vertex C_ of that new triangle */
+    const float phi = randu * solid_angle - alpha;
+    float s, t;
+    fast_sincosf(phi, &s, &t);
+    const float u = t - cos_alpha;
+    const float v = s + product;
+
+    const float3 U = safe_normalize(C - dot(C, A) * A);
+
+    float q = 1.0f;
+    const float det = ((v * s + u * t) * sin_alpha);
+    if (det != 0.0f) {
+      q = ((v * t - u * s) * cos_alpha - v) / det;
+    }
+    const float temp = max(1.0f - q * q, 0.0f);
+
+    const float3 C_ = safe_normalize(q * A + sqrtf(temp) * U);
+
+    /* Finally, select a random point along the edge of the new triangle
+     * That point on the spherical triangle is the sampled ray direction */
+    const float z = 1.0f - randv * (1.0f - dot(C_, B));
+    ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);
+
+    /* calculate intersection with the planar triangle */
+    if (!ray_triangle_intersect(
+            P, ls->D, 0.0f, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
+      ls->pdf = 0.0f;
+      return false;
+    }
+
+    ls->P = P + ls->D * ls->t;
+
+    /* distribution_pdf_triangles is calculated over triangle area, but we're sampling over solid
+     * angle */
+    if (UNLIKELY(solid_angle == 0.0f)) {
+      ls->pdf = 0.0f;
+      return false;
+    }
+    else {
+      ls->pdf = 1.0f / solid_angle;
+    }
+  }
+  else {
+    if (UNLIKELY(area == 0.0f)) {
+      return 0.0f;
+    }
+
+    /* compute random point in triangle. From Eric Heitz's "A Low-Distortion Map Between Triangle
+     * and Square" */
+    float u = randu;
+    float v = randv;
+    if (v > u) {
+      u *= 0.5f;
+      v -= u;
+    }
+    else {
+      v *= 0.5f;
+      u -= v;
+    }
+
+    const float t = 1.0f - u - v;
+    ls->P = u * V[0] + v * V[1] + t * V[2];
+    /* compute incoming direction, distance and pdf */
+    ls->D = normalize_len(ls->P - P, &ls->t);
+    ls->pdf = triangle_light_pdf_area_sampling(ls->Ng, -ls->D, ls->t) / area;
+    ls->u = u;
+    ls->v = v;
+  }
+
+  /* Belongs in distribution.h but can reuse computations here. */
+  if (!kernel_data.integrator.use_light_tree) {
+    float distribution_area = area;
+
+    if (has_motion && area != 0.0f) {
+      /* For motion blur need area of triangle at fixed time as used in the CDF. */
+      triangle_world_space_vertices(kg, object, prim, -1.0f, V);
+      distribution_area = triangle_area(V[0], V[1], V[2]);
+    }
+
+    ls->pdf_selection = distribution_area * kernel_data.integrator.distribution_pdf_triangles;
+  }
+
+  return (ls->pdf > 0.0f);
+}
+
+template<bool in_volume_segment>
+ccl_device_forceinline bool triangle_light_tree_parameters(
+    KernelGlobals kg,
+    const ccl_global KernelLightTreeEmitter *kemitter,
+    const float3 centroid,
+    const float3 P,
+    const float3 N,
+    const BoundingCone bcone,
+    ccl_private float &cos_theta_u,
+    ccl_private float2 &distance,
+    ccl_private float3 &point_to_centroid)
+{
+  if (!in_volume_segment) {
+    /* TODO: a cheap substitute for minimal distance between point and primitive. Does it
+     * worth the overhead to compute the accurate minimal distance? */
+    float min_distance;
+    point_to_centroid = safe_normalize_len(centroid - P, &min_distance);
+    distance = make_float2(min_distance, min_distance);
+  }
+
+  cos_theta_u = FLT_MAX;
+
+  const int object = kemitter->mesh_light.object_id;
+  float3 vertices[3];
+  triangle_world_space_vertices(kg, object, kemitter->prim_id, -1.0f, vertices);
+
+  bool shape_above_surface = false;
+  for (int i = 0; i < 3; i++) {
+    const float3 corner = vertices[i];
+    float distance_point_to_corner;
+    const float3 point_to_corner = safe_normalize_len(corner - P, &distance_point_to_corner);
+    cos_theta_u = fminf(cos_theta_u, dot(point_to_centroid, point_to_corner));
+    shape_above_surface |= dot(point_to_corner, N) > 0;
+    if (!in_volume_segment) {
+      distance.x = fmaxf(distance.x, distance_point_to_corner);
+    }
+  }
+
+  const bool front_facing = bcone.theta_o != 0.0f || dot(bcone.axis, point_to_centroid) < 0;
+  const bool in_volume = is_zero(N);
+
+  return (front_facing && shape_above_surface) || in_volume;
+}
+
+CCL_NAMESPACE_END
--- a/Show More
+++ b/Show More