Merge branch 'master' into temp-ui-cpp

Cleanup: Move interface_intern.hh
The entire interface directory is now compiled as C++ files!
2022-11-26 00:22:49 -06:00 · 2022-11-26 00:21:17 -06:00 · 2022-11-26 00:01:49 -06:00 · 2022-11-25 23:48:33 -06:00 · 2022-11-25 23:48:18 -06:00 · 2022-11-25 19:55:04 -05:00
1287 changed files with 56289 additions and 30420 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1239,12 +1239,11 @@ if(WITH_OPENGL)
  add_definitions(-DWITH_OPENGL)
 endif()

-
-# -----------------------------------------------------------------------------
+#-----------------------------------------------------------------------------
 # Configure Vulkan.

 if(WITH_VULKAN_BACKEND)
-  add_definitions(-DWITH_VULKAN_BACKEND)
+  list(APPEND BLENDER_GL_LIBRARIES ${VULKAN_LIBRARIES})
 endif()

 # -----------------------------------------------------------------------------
--- a/build_files/build_environment/cmake/download.cmake
+++ b/build_files/build_environment/cmake/download.cmake
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 ## Update and uncomment this in the release branch
-set(BLENDER_VERSION 3.4)
+# set(BLENDER_VERSION 3.1)

 function(download_source dep)
  set(TARGET_FILE ${${dep}_FILE})
--- a/build_files/cmake/Modules/FindMoltenVK.cmake
+++ b/build_files/cmake/Modules/FindMoltenVK.cmake
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2022 Blender Foundation.
+
+# - Find MoltenVK libraries
+# Find the MoltenVK includes and libraries
+# This module defines
+#  MOLTENVK_INCLUDE_DIRS, where to find MoltenVK headers, Set when
+#                        MOLTENVK_INCLUDE_DIR is found.
+#  MOLTENVK_LIBRARIES, libraries to link against to use MoltenVK.
+#  MOLTENVK_ROOT_DIR, The base directory to search for MoltenVK.
+#                    This can also be an environment variable.
+#  MOLTENVK_FOUND, If false, do not try to use MoltenVK.
+#
+
+# Fall back to the MOLTENVK_ROOT_DIR environment variable (quoted: it may be unset or empty).
+IF(NOT MOLTENVK_ROOT_DIR AND NOT "$ENV{MOLTENVK_ROOT_DIR}" STREQUAL "")
+  SET(MOLTENVK_ROOT_DIR "$ENV{MOLTENVK_ROOT_DIR}")
+ENDIF()
+
+SET(_moltenvk_SEARCH_DIRS
+  ${MOLTENVK_ROOT_DIR}
+  ${LIBDIR}/vulkan/MoltenVK
+)
+
+
+FIND_PATH(MOLTENVK_INCLUDE_DIR
+  NAMES
+    MoltenVK/vk_mvk_moltenvk.h
+  HINTS
+    ${_moltenvk_SEARCH_DIRS}
+  PATH_SUFFIXES
+    include
+)
+
+FIND_LIBRARY(MOLTENVK_LIBRARY
+  NAMES
+    MoltenVK
+  HINTS
+    ${_moltenvk_SEARCH_DIRS}
+  PATH_SUFFIXES
+    dylib/macOS
+)
+
+# handle the QUIETLY and REQUIRED arguments and set MOLTENVK_FOUND to TRUE if
+# all listed variables are TRUE
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(MoltenVK DEFAULT_MSG MOLTENVK_LIBRARY MOLTENVK_INCLUDE_DIR)
+
+IF(MOLTENVK_FOUND)
+  SET(MOLTENVK_LIBRARIES ${MOLTENVK_LIBRARY})
+  SET(MOLTENVK_INCLUDE_DIRS ${MOLTENVK_INCLUDE_DIR})
+ENDIF()
+
+MARK_AS_ADVANCED(
+  MOLTENVK_INCLUDE_DIR
+  MOLTENVK_LIBRARY
+)
+
+UNSET(_moltenvk_SEARCH_DIRS)
--- a/build_files/cmake/platform/platform_apple.cmake
+++ b/build_files/cmake/platform/platform_apple.cmake
@@ -100,6 +100,23 @@ if(WITH_USD)
  find_package(USD REQUIRED)
 endif()

+if(WITH_VULKAN_BACKEND)
+  find_package(MoltenVK REQUIRED)
+
+  if(EXISTS ${LIBDIR}/vulkan)
+    set(VULKAN_FOUND On)
+    set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan/macOS)
+    set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
+    set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/libvulkan.1.dylib)
+
+    set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR} ${MOLTENVK_INCLUDE_DIRS})
+    set(VULKAN_LIBRARIES ${VULKAN_LIBRARY} ${MOLTENVK_LIBRARIES})
+  else()
+    message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
+    set(WITH_VULKAN_BACKEND OFF)
+  endif()
+endif()
+
 if(WITH_OPENSUBDIV)
  find_package(OpenSubdiv)
 endif()
--- a/build_files/cmake/platform/platform_unix.cmake
+++ b/build_files/cmake/platform/platform_unix.cmake
@@ -108,6 +108,10 @@ find_package_wrapper(ZLIB REQUIRED)
 find_package_wrapper(Zstd REQUIRED)
 find_package_wrapper(Epoxy REQUIRED)

+if(WITH_VULKAN_BACKEND)
+  find_package_wrapper(Vulkan REQUIRED)
+endif()
+
 function(check_freetype_for_brotli)
  include(CheckSymbolExists)
  set(CMAKE_REQUIRED_INCLUDES ${FREETYPE_INCLUDE_DIRS})
@@ -322,9 +326,10 @@ if(WITH_CYCLES AND WITH_CYCLES_DEVICE_ONEAPI)
  file(GLOB _sycl_runtime_libraries
    ${SYCL_ROOT_DIR}/lib/libsycl.so
    ${SYCL_ROOT_DIR}/lib/libsycl.so.*
-    ${SYCL_ROOT_DIR}/lib/libpi_level_zero.so
+    ${SYCL_ROOT_DIR}/lib/libpi_*.so
  )
  list(FILTER _sycl_runtime_libraries EXCLUDE REGEX ".*\\.py")  # "\\." is a literal dot; a bare "\." collapses to "." (any character).
+  list(REMOVE_ITEM _sycl_runtime_libraries "${SYCL_ROOT_DIR}/lib/libpi_opencl.so")
  list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
  unset(_sycl_runtime_libraries)
 endif()
--- a/build_files/cmake/platform/platform_win32.cmake
+++ b/build_files/cmake/platform/platform_win32.cmake
@@ -419,7 +419,7 @@ if(WITH_IMAGE_OPENEXR)
    warn_hardcoded_paths(OpenEXR)
    set(OPENEXR ${LIBDIR}/openexr)
    set(OPENEXR_INCLUDE_DIR ${OPENEXR}/include)
-    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR}/include/OpenEXR)
+    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${IMATH_INCLUDE_DIRS} ${OPENEXR_INCLUDE_DIR}/OpenEXR)
    set(OPENEXR_LIBPATH ${OPENEXR}/lib)
    # Check if the 3.x library name exists
    # if not assume this is a 2.x library folder
@@ -568,7 +568,8 @@ if(WITH_OPENIMAGEIO)
  if(NOT OpenImageIO_FOUND)
    set(OPENIMAGEIO ${LIBDIR}/OpenImageIO)
    set(OPENIMAGEIO_LIBPATH ${OPENIMAGEIO}/lib)
-    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO}/include)
+    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR})
    set(OIIO_OPTIMIZED optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO.lib optimized ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util.lib)
    set(OIIO_DEBUG debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_d.lib debug ${OPENIMAGEIO_LIBPATH}/OpenImageIO_Util_d.lib)
    set(OPENIMAGEIO_LIBRARIES ${OIIO_OPTIMIZED} ${OIIO_DEBUG})
@@ -785,6 +786,14 @@ if(WITH_CYCLES AND WITH_CYCLES_OSL)
  endif()
  find_path(OSL_INCLUDE_DIR OSL/oslclosure.h PATHS ${CYCLES_OSL}/include)
  find_program(OSL_COMPILER NAMES oslc PATHS ${CYCLES_OSL}/bin)
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MAJOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+[0-9]+.*$")
+  file(STRINGS "${OSL_INCLUDE_DIR}/OSL/oslversion.h" OSL_LIBRARY_VERSION_MINOR
+       REGEX "^[ \t]*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+[0-9]+.*$")
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MAJOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MAJOR ${OSL_LIBRARY_VERSION_MAJOR})
+  string(REGEX REPLACE ".*#define[ \t]+OSL_LIBRARY_VERSION_MINOR[ \t]+([.0-9]+).*"
+         "\\1" OSL_LIBRARY_VERSION_MINOR ${OSL_LIBRARY_VERSION_MINOR})
 endif()

 if(WITH_CYCLES AND WITH_CYCLES_EMBREE)
@@ -917,6 +926,20 @@ if(WITH_HARU)
  set(HARU_LIBRARIES ${HARU_ROOT_DIR}/lib/libhpdfs.lib)
 endif()

+if(WITH_VULKAN_BACKEND)
+  if(EXISTS ${LIBDIR}/vulkan)
+    set(VULKAN_FOUND On)
+    set(VULKAN_ROOT_DIR ${LIBDIR}/vulkan)
+    set(VULKAN_INCLUDE_DIR ${VULKAN_ROOT_DIR}/include)
+    set(VULKAN_INCLUDE_DIRS ${VULKAN_INCLUDE_DIR})
+    set(VULKAN_LIBRARY ${VULKAN_ROOT_DIR}/lib/vulkan-1.lib)
+    set(VULKAN_LIBRARIES ${VULKAN_LIBRARY})
+  else()
+    message(WARNING "Vulkan SDK was not found, disabling WITH_VULKAN_BACKEND")
+    set(WITH_VULKAN_BACKEND OFF)
+  endif()
+endif()
+
 if(WITH_CYCLES AND WITH_CYCLES_PATH_GUIDING)
  find_package(openpgl QUIET)
  if(openpgl_FOUND)
@@ -949,7 +972,13 @@ if(WITH_CYCLES AND WITH_CYCLES_DEVICE_ONEAPI)
  endforeach()
  unset(_sycl_runtime_libraries_glob)

-  list(APPEND _sycl_runtime_libraries ${SYCL_ROOT_DIR}/bin/pi_level_zero.dll)
+  file(GLOB _sycl_pi_runtime_libraries_glob
+    ${SYCL_ROOT_DIR}/bin/pi_*.dll
+  )
+  list(REMOVE_ITEM _sycl_pi_runtime_libraries_glob "${SYCL_ROOT_DIR}/bin/pi_opencl.dll")
+  list (APPEND _sycl_runtime_libraries ${_sycl_pi_runtime_libraries_glob})
+  unset(_sycl_pi_runtime_libraries_glob)
+
  list(APPEND PLATFORM_BUNDLED_LIBRARIES ${_sycl_runtime_libraries})
  unset(_sycl_runtime_libraries)
 endif()
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -5,38 +5,38 @@
 update-code:
    git:
        submodules:
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons_contrib
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: release/datafiles/locale
-        -   branch: blender-v3.4-release
+        -   branch: master
            commit_id: HEAD
            path: source/tools
    svn:
        libraries:
            darwin-arm64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin_arm64
            darwin-x86_64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin
            linux-x86_64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/linux_centos7_x86_64
            windows-amd64:
-                branch: tags/blender-3.4-release
+                branch: trunk
                commit_id: HEAD
                path: lib/win64_vc15
        tests:
-            branch: tags/blender-3.4-release
+            branch: trunk
            commit_id: HEAD
            path: lib/tests
        benchmarks:
--- a/build_files/utils/make_bpy_wheel.py
+++ b/build_files/utils/make_bpy_wheel.py
@@ -53,18 +53,7 @@ This package provides Blender as a Python module for use in studio pipelines, we

 [System requirements](https://www.blender.org/download/requirements/) are the same as Blender.

-Each Blender release supports one Python version, and the package is only compatible with that version.
-
-## Source Code
-
-* [Releases](https://download.blender.org/source/)
-* Repository: [git.blender.org/blender.git](https://git.blender.org/gitweb/gitweb.cgi/blender.git)
-
-## Credits
-
-Created by the [Blender developer community](https://www.blender.org/about/credits/).
-
-Thanks to Tyler Alden Gubala for maintaining the original version of this package."""
+Each Blender release supports one Python version, and the package is only compatible with that version."""

 # ------------------------------------------------------------------------------
 # Generic Functions
--- a/doc/doxygen/Doxyfile
+++ b/doc/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = Blender
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = V3.4
+PROJECT_NUMBER         = V3.5

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -91,3 +91,7 @@ endif()
 if(WITH_COMPOSITOR_CPU)
  add_subdirectory(smaa_areatex)
 endif()
+
+if(WITH_VULKAN_BACKEND)
+  add_subdirectory(vulkan_memory_allocator)
+endif()
--- a/extern/vulkan_memory_allocator/CMakeLists.txt
+++ b/extern/vulkan_memory_allocator/CMakeLists.txt
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright 2022 Blender Foundation. All rights reserved.
+
+set(INC
+  .
+)
+
+set(INC_SYS
+  ${VULKAN_INCLUDE_DIRS}
+)
+
+set(SRC
+  vk_mem_alloc_impl.cc
+
+  vk_mem_alloc.h
+)
+
+blender_add_lib(extern_vulkan_memory_allocator "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
+
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
+  target_compile_options(extern_vulkan_memory_allocator
+    PRIVATE "-Wno-nullability-completeness"
+  )
+endif()
--- a/extern/vulkan_memory_allocator/LICENSE.txt
+++ b/extern/vulkan_memory_allocator/LICENSE.txt
@@ -0,0 +1,19 @@
+Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/extern/vulkan_memory_allocator/README.blender
+++ b/extern/vulkan_memory_allocator/README.blender
@@ -0,0 +1,5 @@
+Project: VulkanMemoryAllocator
+URL: https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator
+License: MIT
+Upstream version: a6bfc23
+Local modifications: None
--- a/extern/vulkan_memory_allocator/README.md
+++ b/extern/vulkan_memory_allocator/README.md
@@ -0,0 +1,175 @@
+# Vulkan Memory Allocator
+
+Easy to integrate Vulkan memory allocation library.
+
+**Documentation:** Browse online: [Vulkan Memory Allocator](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/) (generated from Doxygen-style comments in [include/vk_mem_alloc.h](include/vk_mem_alloc.h))
+
+**License:** MIT. See [LICENSE.txt](LICENSE.txt)
+
+**Changelog:** See [CHANGELOG.md](CHANGELOG.md)
+
+**Product page:** [Vulkan Memory Allocator on GPUOpen](https://gpuopen.com/gaming-product/vulkan-memory-allocator/)
+
+**Build status:**
+
+- Windows: [![Build status](https://ci.appveyor.com/api/projects/status/4vlcrb0emkaio2pn/branch/master?svg=true)](https://ci.appveyor.com/project/adam-sawicki-amd/vulkanmemoryallocator/branch/master)  
+- Linux: [![Build Status](https://app.travis-ci.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.svg?branch=master)](https://app.travis-ci.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator)
+
+[![Average time to resolve an issue](http://isitmaintained.com/badge/resolution/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator.svg)](http://isitmaintained.com/project/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator "Average time to resolve an issue")
+
+# Problem
+
+Memory allocation and resource (buffer and image) creation in Vulkan is difficult (comparing to older graphics APIs, like D3D11 or OpenGL) for several reasons:
+
+- It requires a lot of boilerplate code, just like everything else in Vulkan, because it is a low-level and high-performance API.
+- There is additional level of indirection: `VkDeviceMemory` is allocated separately from creating `VkBuffer`/`VkImage` and they must be bound together.
+- Driver must be queried for supported memory heaps and memory types. Different GPU vendors provide different types of it.
+- It is recommended to allocate bigger chunks of memory and assign parts of them to particular resources, as there is a limit on maximum number of memory blocks that can be allocated.
+
+# Features
+
+This library can help game developers to manage memory allocations and resource creation by offering some higher-level functions:
+
+1. Functions that help to choose correct and optimal memory type based on intended usage of the memory.
+   - Required or preferred traits of the memory are expressed using higher-level description comparing to Vulkan flags.
+2. Functions that allocate memory blocks, reserve and return parts of them (`VkDeviceMemory` + offset + size) to the user.
+   - Library keeps track of allocated memory blocks, used and unused ranges inside them, finds best matching unused ranges for new allocations, respects all the rules of alignment and buffer/image granularity.
+3. Functions that can create an image/buffer, allocate memory for it and bind them together - all in one call.
+
+Additional features:
+
+- Well-documented - description of all functions and structures provided, along with chapters that contain general description and example code.
+- Thread-safety: Library is designed to be used in multithreaded code. Access to a single device memory block referred by different buffers and textures (binding, mapping) is synchronized internally. Memory mapping is reference-counted.
+- Configuration: Fill optional members of `VmaAllocatorCreateInfo` structure to provide custom CPU memory allocator, pointers to Vulkan functions and other parameters.
+- Customization and integration with custom engines: Predefine appropriate macros to provide your own implementation of all external facilities used by the library like assert, mutex, atomic.
+- Support for memory mapping, reference-counted internally. Support for persistently mapped memory: Just allocate with appropriate flag and access the pointer to already mapped memory.
+- Support for non-coherent memory. Functions that flush/invalidate memory. `nonCoherentAtomSize` is respected automatically.
+- Support for resource aliasing (overlap).
+- Support for sparse binding and sparse residency: Convenience functions that allocate or free multiple memory pages at once.
+- Custom memory pools: Create a pool with desired parameters (e.g. fixed or limited maximum size) and allocate memory out of it.
+- Linear allocator: Create a pool with linear algorithm and use it for much faster allocations and deallocations in free-at-once, stack, double stack, or ring buffer fashion.
+- Support for Vulkan 1.0, 1.1, 1.2, 1.3.
+- Support for extensions (and equivalent functionality included in new Vulkan versions):
+   - VK_KHR_dedicated_allocation: Just enable it and it will be used automatically by the library.
+   - VK_KHR_buffer_device_address: Flag `VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR` is automatically added to memory allocations where needed.
+   - VK_EXT_memory_budget: Used internally if available to query for current usage and budget. If not available, it falls back to an estimation based on memory heap sizes.
+   - VK_EXT_memory_priority: Set `priority` of allocations or custom pools and it will be set automatically using this extension.
+   - VK_AMD_device_coherent_memory
+- Defragmentation of GPU and CPU memory: Let the library move data around to free some memory blocks and make your allocations better compacted.
+- Statistics: Obtain brief or detailed statistics about the amount of memory used, unused, number of allocated blocks, number of allocations etc. - globally, per memory heap, and per memory type.
+- Debug annotations: Associate custom `void* pUserData` and debug `char* pName` with each allocation.
+- JSON dump: Obtain a string in JSON format with detailed map of internal state, including list of allocations, their string names, and gaps between them.
+- Convert this JSON dump into a picture to visualize your memory. See [tools/GpuMemDumpVis](tools/GpuMemDumpVis/README.md).
+- Debugging incorrect memory usage: Enable initialization of all allocated memory with a bit pattern to detect usage of uninitialized or freed memory. Enable validation of a magic number after every allocation to detect out-of-bounds memory corruption.
+- Support for interoperability with OpenGL.
+- Virtual allocator: Interface for using core allocation algorithm to allocate any custom data, e.g. pieces of one large buffer.
+
+# Prerequisites
+
+- Self-contained C++ library in single header file. No external dependencies other than standard C and C++ library and of course Vulkan. Some features of C++14 used. STL containers, RTTI, or C++ exceptions are not used.
+- Public interface in C, in same convention as Vulkan API. Implementation in C++.
+- Error handling implemented by returning `VkResult` error codes - same way as in Vulkan.
+- Interface documented using Doxygen-style comments.
+- Platform-independent, but developed and tested on Windows using Visual Studio. Continuous integration setup for Windows and Linux. Used also on Android, MacOS, and other platforms.
+
+# Example
+
+Basic usage of this library is very simple. Advanced features are optional. After you created global `VmaAllocator` object, a complete code needed to create a buffer may look like this:
+
+```cpp
+VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
+bufferInfo.size = 65536;
+bufferInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
+
+VmaAllocationCreateInfo allocInfo = {};
+allocInfo.usage = VMA_MEMORY_USAGE_AUTO;
+
+VkBuffer buffer;
+VmaAllocation allocation;
+vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr);
+```
+
+With this one function call:
+
+1. `VkBuffer` is created.
+2. `VkDeviceMemory` block is allocated if needed.
+3. An unused region of the memory block is bound to this buffer.
+
+`VmaAllocation` is an object that represents memory assigned to this buffer. It can be queried for parameters like `VkDeviceMemory` handle and offset.
+
+# How to build
+
+On Windows it is recommended to use [CMake UI](https://cmake.org/runningcmake/). Alternatively you can generate a Visual Studio project map using CMake in command line: `cmake -B./build/ -DCMAKE_BUILD_TYPE=Debug -G "Visual Studio 16 2019" -A x64 ./`
+
+On Linux:
+
+```
+mkdir build
+cd build
+cmake ..
+make
+```
+
+The following targets are available
+
+| Target | Description | CMake option | Default setting |
+| ------------- | ------------- | ------------- | ------------- |
+| VmaSample | VMA sample application | `VMA_BUILD_SAMPLE` | `OFF` |
+| VmaBuildSampleShaders | Shaders for VmaSample | `VMA_BUILD_SAMPLE_SHADERS` | `OFF` |
+
+Please note that while VulkanMemoryAllocator library is supported on other platforms besides Windows, VmaSample is not.
+
+These CMake options are available
+
+| CMake option | Description | Default setting |
+| ------------- | ------------- | ------------- |
+| `VMA_RECORDING_ENABLED` | Enable VMA memory recording for debugging | `OFF` |
+| `VMA_USE_STL_CONTAINERS` | Use C++ STL containers instead of VMA's containers | `OFF` |
+| `VMA_STATIC_VULKAN_FUNCTIONS` | Link statically with Vulkan API | `OFF` |
+| `VMA_DYNAMIC_VULKAN_FUNCTIONS` | Fetch pointers to Vulkan functions internally (no static linking) | `ON` |
+| `VMA_DEBUG_ALWAYS_DEDICATED_MEMORY` | Every allocation will have its own memory block | `OFF` |
+| `VMA_DEBUG_INITIALIZE_ALLOCATIONS` | Automatically fill new allocations and destroyed allocations with some bit pattern | `OFF` |
+| `VMA_DEBUG_GLOBAL_MUTEX` | Enable single mutex protecting all entry calls to the library | `OFF` |
+| `VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT` | Never exceed [VkPhysicalDeviceLimits::maxMemoryAllocationCount](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#limits-maxMemoryAllocationCount) and return error | `OFF` |
+
+# Binaries
+
+The release comes with precompiled binary executable for "VulkanSample" application which contains test suite. It is compiled using Visual Studio 2019, so it requires appropriate libraries to work, including "MSVCP140.dll", "VCRUNTIME140.dll", "VCRUNTIME140_1.dll". If the launch fails with error message telling about those files missing, please download and install [Microsoft Visual C++ Redistributable for Visual Studio 2015, 2017 and 2019](https://support.microsoft.com/en-us/help/2977003/the-latest-supported-visual-c-downloads), "x64" version.
+
+# Read more
+
+See **[Documentation](https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/)**.
+
+# Software using this library
+
+- **[X-Plane](https://x-plane.com/)**
+- **[Detroit: Become Human](https://gpuopen.com/learn/porting-detroit-3/)**
+- **[Vulkan Samples](https://github.com/LunarG/VulkanSamples)** - official Khronos Vulkan samples. License: Apache-style.
+- **[Anvil](https://github.com/GPUOpen-LibrariesAndSDKs/Anvil)** - cross-platform framework for Vulkan. License: MIT.
+- **[Filament](https://github.com/google/filament)** - physically based rendering engine for Android, Windows, Linux and macOS, from Google. Apache License 2.0.
+- **[Atypical Games - proprietary game engine](https://developer.samsung.com/galaxy-gamedev/gamedev-blog/infinitejet.html)**
+- **[Flax Engine](https://flaxengine.com/)**
+- **[Godot Engine](https://github.com/godotengine/godot/)** - multi-platform 2D and 3D game engine. License: MIT.
+- **[Lightweight Java Game Library (LWJGL)](https://www.lwjgl.org/)** - includes binding of the library for Java. License: BSD.
+- **[PowerVR SDK](https://github.com/powervr-graphics/Native_SDK)** - C++ cross-platform 3D graphics SDK, from Imagination. License: MIT.
+- **[Skia](https://github.com/google/skia)** - complete 2D graphic library for drawing Text, Geometries, and Images, from Google.
+- **[The Forge](https://github.com/ConfettiFX/The-Forge)** - cross-platform rendering framework. Apache License 2.0.
+- **[VK9](https://github.com/disks86/VK9)** - Direct3D 9 compatibility layer using Vulkan. Zlib license.
+- **[vkDOOM3](https://github.com/DustinHLand/vkDOOM3)** - Vulkan port of GPL DOOM 3 BFG Edition. License: GNU GPL.
+- **[vkQuake2](https://github.com/kondrak/vkQuake2)** - vanilla Quake 2 with Vulkan support. License: GNU GPL.
+- **[Vulkan Best Practice for Mobile Developers](https://github.com/ARM-software/vulkan_best_practice_for_mobile_developers)** from ARM. License: MIT.
+- **[RPCS3](https://github.com/RPCS3/rpcs3)** - PlayStation 3 emulator/debugger. License: GNU GPLv2.
+- **[PPSSPP](https://github.com/hrydgard/ppsspp)** - Playstation Portable emulator/debugger. License: GNU GPLv2+.
+
+[Many other projects on GitHub](https://github.com/search?q=AMD_VULKAN_MEMORY_ALLOCATOR_H&type=Code) and some game development studios that use Vulkan in their games.
+
+# See also
+
+- **[D3D12 Memory Allocator](https://github.com/GPUOpen-LibrariesAndSDKs/D3D12MemoryAllocator)** - equivalent library for Direct3D 12. License: MIT.
+- **[Awesome Vulkan](https://github.com/vinjn/awesome-vulkan)** - a curated list of awesome Vulkan libraries, debuggers and resources.
+- **[VulkanMemoryAllocator-Hpp](https://github.com/malte-v/VulkanMemoryAllocator-Hpp)** - C++ binding for this library. License: CC0-1.0.
+- **[PyVMA](https://github.com/realitix/pyvma)** - Python wrapper for this library. Author: Jean-Sébastien B. (@realitix). License: Apache 2.0.
+- **[vk-mem](https://github.com/gwihlidal/vk-mem-rs)** - Rust binding for this library. Author: Graham Wihlidal. License: Apache 2.0 or MIT.
+- **[Haskell bindings](https://hackage.haskell.org/package/VulkanMemoryAllocator)**, **[github](https://github.com/expipiplus1/vulkan/tree/master/VulkanMemoryAllocator)** - Haskell bindings for this library. Author: Ellie Hermaszewska (@expipiplus1). License BSD-3-Clause.
+- **[vma_sample_sdl](https://github.com/rextimmy/vma_sample_sdl)** - SDL port of the sample app of this library (with the goal of running it on multiple platforms, including MacOS). Author: @rextimmy. License: MIT.
+- **[vulkan-malloc](https://github.com/dylanede/vulkan-malloc)** - Vulkan memory allocation library for Rust. Based on version 1 of this library. Author: Dylan Ede (@dylanede). License: MIT / Apache 2.0.
--- a/extern/vulkan_memory_allocator/vk_mem_alloc.h
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc.h
--- a/extern/vulkan_memory_allocator/vk_mem_alloc_impl.cc
+++ b/extern/vulkan_memory_allocator/vk_mem_alloc_impl.cc
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright 2022 Blender Foundation. All rights reserved. */
+
+#ifdef __APPLE__
+#  include <MoltenVK/vk_mvk_moltenvk.h>
+#else
+#  include <vulkan/vulkan.h>
+#endif
+
+#define VMA_IMPLEMENTATION
+
+#include "vk_mem_alloc.h"
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -253,6 +253,33 @@ if(WITH_CYCLES_OSL)
  )
 endif()

+if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
+  add_definitions(-DWITH_CUDA)
+
+  if(WITH_CUDA_DYNLOAD)
+    include_directories(
+      ../../extern/cuew/include
+    )
+    add_definitions(-DWITH_CUDA_DYNLOAD)
+  else()
+    include_directories(
+      SYSTEM
+      ${CUDA_TOOLKIT_INCLUDE}
+    )
+  endif()
+endif()
+
+if(WITH_CYCLES_DEVICE_HIP)
+  add_definitions(-DWITH_HIP)
+
+  if(WITH_HIP_DYNLOAD)
+    include_directories(
+      ../../extern/hipew/include
+    )
+    add_definitions(-DWITH_HIP_DYNLOAD)
+  endif()
+endif()
+
 if(WITH_CYCLES_DEVICE_OPTIX)
  find_package(OptiX 7.3.0)

@@ -261,12 +288,16 @@ if(WITH_CYCLES_DEVICE_OPTIX)
    include_directories(
      SYSTEM
      ${OPTIX_INCLUDE_DIR}
-      )
+    )
  else()
    set_and_warn_library_found("OptiX" OPTIX_FOUND WITH_CYCLES_DEVICE_OPTIX)
  endif()
 endif()

+if(WITH_CYCLES_DEVICE_METAL)
+  add_definitions(-DWITH_METAL)
+endif()
+
 if (WITH_CYCLES_DEVICE_ONEAPI)
  add_definitions(-DWITH_ONEAPI)
 endif()
--- a/intern/cycles/blender/addon/init.py
+++ b/intern/cycles/blender/addon/init.py
@@ -58,7 +58,7 @@ class CyclesRender(bpy.types.RenderEngine):
        if not self.session:
            if self.is_preview:
                cscene = bpy.context.scene.cycles
-                use_osl = cscene.shading_system and cscene.device == 'CPU'
+                use_osl = cscene.shading_system

                engine.create(self, data, preview_osl=use_osl)
            else:
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -156,6 +156,11 @@ def with_osl():
    return _cycles.with_osl


+def osl_version():
+    import _cycles
+    return _cycles.osl_version
+
+
 def with_path_guiding():
    import _cycles
    return _cycles.with_path_guiding
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -290,7 +290,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
    )
    shading_system: BoolProperty(
        name="Open Shading Language",
-        description="Use Open Shading Language (CPU rendering only)",
+        description="Use Open Shading Language",
    )

    preview_pause: BoolProperty(
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -2307,7 +2307,10 @@ def draw_device(self, context):
        col.prop(cscene, "device")

        from . import engine
-        if engine.with_osl() and use_cpu(context):
+        if engine.with_osl() and (
+                use_cpu(context) or
+                (use_optix(context) and (engine.osl_version()[1] >= 13 or engine.osl_version()[0] > 1))
+        ):
            col.prop(cscene, "shading_system")


--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -367,11 +367,13 @@ static void attr_create_generic(Scene *scene,
 {
  AttributeSet &attributes = (subdivision) ? mesh->subd_attributes : mesh->attributes;
  static const ustring u_velocity("velocity");
-  const ustring default_color_name{b_mesh.attributes.default_color_name().c_str()};
+
+  int attribute_index = 0;
+  int render_color_index = b_mesh.attributes.render_color_index();

  for (BL::Attribute &b_attribute : b_mesh.attributes) {
    const ustring name{b_attribute.name().c_str()};
-    const bool is_render_color = name == default_color_name;
+    const bool is_render_color = (attribute_index++ == render_color_index);

    if (need_motion && name == u_velocity) {
      attr_create_motion(mesh, b_attribute, motion_scale);
@@ -1083,11 +1085,11 @@ static void create_subd_mesh(Scene *scene,
  const int edges_num = b_mesh.edges.length();

  if (edges_num != 0 && b_mesh.edge_creases.length() > 0) {
-    BL::MeshEdgeCreaseLayer creases = b_mesh.edge_creases[0];
-
    size_t num_creases = 0;
+    const float *creases = static_cast<float *>(b_mesh.edge_creases[0].ptr.data);
+
    for (int i = 0; i < edges_num; i++) {
-      if (creases.data[i].value() != 0.0f) {
+      if (creases[i] != 0.0f) {
        num_creases++;
      }
    }
@@ -1096,18 +1098,18 @@ static void create_subd_mesh(Scene *scene,

    const MEdge *edges = static_cast<MEdge *>(b_mesh.edges[0].ptr.data);
    for (int i = 0; i < edges_num; i++) {
-      const float crease = creases.data[i].value();
-      if (crease != 0.0f) {
+      if (creases[i] != 0.0f) {
        const MEdge &b_edge = edges[i];
-        mesh->add_edge_crease(b_edge.v1, b_edge.v2, crease);
+        mesh->add_edge_crease(b_edge.v1, b_edge.v2, creases[i]);
      }
    }
  }

+  /* Vertex creases are exported even when the mesh has no edge crease layer. */
  for (BL::MeshVertexCreaseLayer &c : b_mesh.vertex_creases) {
    for (int i = 0; i < c.data.length(); ++i) {
      if (c.data[i].value() != 0.0f) {
        mesh->add_vertex_crease(i, c.data[i].value());
      }
    }
  }
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -1,6 +1,584 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2011-2022 Blender Foundation

+###########################################################################
+# Helper macros
+###########################################################################
+
+macro(_set_default variable value)
+  if(NOT ${variable})
+    set(${variable} ${value})
+  endif()
+endmacro()
+
+###########################################################################
+# Precompiled libraries detection
+#
+# Use precompiled libraries from Blender repository
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(APPLE)
+    if("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "x86_64")
+      set(_cycles_lib_dir "${CMAKE_SOURCE_DIR}/../lib/darwin")
+    else()
+      set(_cycles_lib_dir "${CMAKE_SOURCE_DIR}/../lib/darwin_arm64")
+    endif()
+
+    # Always use system zlib
+    find_package(ZLIB REQUIRED)
+  elseif(WIN32)
+    if(CMAKE_CL_64)
+      set(_cycles_lib_dir "${CMAKE_SOURCE_DIR}/../lib/win64_vc15")
+    else()
+      message(FATAL_ERROR "Unsupported Visual Studio Version")
+    endif()
+  else()
+    # Path to locally compiled libraries.
+    set(LIBDIR_NAME ${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR})
+    string(TOLOWER ${LIBDIR_NAME} LIBDIR_NAME)
+    set(LIBDIR_NATIVE_ABI ${CMAKE_SOURCE_DIR}/../lib/${LIBDIR_NAME})
+
+    # Path to precompiled libraries with known CentOS 7 ABI.
+    set(LIBDIR_CENTOS7_ABI ${CMAKE_SOURCE_DIR}/../lib/linux_centos7_x86_64)
+
+    # Choose the best suitable libraries.
+    if(EXISTS ${LIBDIR_NATIVE_ABI})
+      set(_cycles_lib_dir ${LIBDIR_NATIVE_ABI})
+    elseif(EXISTS ${LIBDIR_CENTOS7_ABI})
+      set(_cycles_lib_dir ${LIBDIR_CENTOS7_ABI})
+      set(WITH_CXX11_ABI OFF)
+
+      if(CMAKE_COMPILER_IS_GNUCC AND
+         CMAKE_C_COMPILER_VERSION VERSION_LESS 9.3)
+        message(FATAL_ERROR "GCC version must be at least 9.3 for precompiled libraries, found ${CMAKE_C_COMPILER_VERSION}")
+      endif()
+    endif()
+
+    if(DEFINED _cycles_lib_dir)
+      message(STATUS "Using precompiled libraries at ${_cycles_lib_dir}")
+    endif()
+
+    # Avoid namespace pollution.
+    unset(LIBDIR_NATIVE_ABI)
+    unset(LIBDIR_CENTOS7_ABI)
+  endif()
+
+  if(EXISTS ${_cycles_lib_dir})
+    _set_default(ALEMBIC_ROOT_DIR "${_cycles_lib_dir}/alembic")
+    _set_default(BOOST_ROOT "${_cycles_lib_dir}/boost")
+    _set_default(BLOSC_ROOT_DIR "${_cycles_lib_dir}/blosc")
+    _set_default(EMBREE_ROOT_DIR "${_cycles_lib_dir}/embree")
+    _set_default(EPOXY_ROOT_DIR "${_cycles_lib_dir}/epoxy")
+    _set_default(IMATH_ROOT_DIR "${_cycles_lib_dir}/imath")
+    _set_default(GLEW_ROOT_DIR "${_cycles_lib_dir}/glew")
+    _set_default(JPEG_ROOT "${_cycles_lib_dir}/jpeg")
+    _set_default(LLVM_ROOT_DIR "${_cycles_lib_dir}/llvm")
+    _set_default(CLANG_ROOT_DIR "${_cycles_lib_dir}/llvm")
+    _set_default(NANOVDB_ROOT_DIR "${_cycles_lib_dir}/openvdb")
+    _set_default(OPENCOLORIO_ROOT_DIR "${_cycles_lib_dir}/opencolorio")
+    _set_default(OPENEXR_ROOT_DIR "${_cycles_lib_dir}/openexr")
+    _set_default(OPENIMAGEDENOISE_ROOT_DIR "${_cycles_lib_dir}/openimagedenoise")
+    _set_default(OPENIMAGEIO_ROOT_DIR "${_cycles_lib_dir}/openimageio")
+    _set_default(OPENJPEG_ROOT_DIR "${_cycles_lib_dir}/openjpeg")
+    _set_default(OPENSUBDIV_ROOT_DIR "${_cycles_lib_dir}/opensubdiv")
+    _set_default(OPENVDB_ROOT_DIR "${_cycles_lib_dir}/openvdb")
+    _set_default(OSL_ROOT_DIR "${_cycles_lib_dir}/osl")
+    _set_default(PNG_ROOT "${_cycles_lib_dir}/png")
+    _set_default(PUGIXML_ROOT_DIR "${_cycles_lib_dir}/pugixml")
+    _set_default(SDL2_ROOT_DIR "${_cycles_lib_dir}/sdl")
+    _set_default(TBB_ROOT_DIR "${_cycles_lib_dir}/tbb")
+    _set_default(TIFF_ROOT "${_cycles_lib_dir}/tiff")
+    _set_default(USD_ROOT_DIR "${_cycles_lib_dir}/usd")
+    _set_default(WEBP_ROOT_DIR "${_cycles_lib_dir}/webp")
+    _set_default(ZLIB_ROOT "${_cycles_lib_dir}/zlib")
+    if(WIN32)  # The Windows library folder uses an underscore in its name.
+      _set_default(LEVEL_ZERO_ROOT_DIR "${_cycles_lib_dir}/level_zero")
+    else()  # Keep any user-provided LEVEL_ZERO_ROOT_DIR, like the other roots.
+      _set_default(LEVEL_ZERO_ROOT_DIR "${_cycles_lib_dir}/level-zero")
+    endif()
+    _set_default(SYCL_ROOT_DIR "${_cycles_lib_dir}/dpcpp")
+
+    # Ignore system libraries
+    set(CMAKE_IGNORE_PATH "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES};${CMAKE_SYSTEM_INCLUDE_PATH};${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES};${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}")
+  else()
+    unset(_cycles_lib_dir)
+  endif()
+endif()
+
+###########################################################################
+# Zlib
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(ZLIB_INCLUDE_DIRS ${_cycles_lib_dir}/zlib/include)
+    set(ZLIB_LIBRARIES ${_cycles_lib_dir}/zlib/lib/libz_st.lib)
+    set(ZLIB_INCLUDE_DIR ${_cycles_lib_dir}/zlib/include)
+    set(ZLIB_LIBRARY ${_cycles_lib_dir}/zlib/lib/libz_st.lib)
+    set(ZLIB_DIR ${_cycles_lib_dir}/zlib)
+    set(ZLIB_FOUND ON)
+  elseif(NOT APPLE)
+    find_package(ZLIB REQUIRED)
+  endif()
+endif()
+
+###########################################################################
+# PThreads
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(PTHREADS_LIBRARIES "${_cycles_lib_dir}/pthreads/lib/pthreadVC3.lib")
+    include_directories("${_cycles_lib_dir}/pthreads/include")
+  else()
+    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+    find_package(Threads REQUIRED)
+    set(PTHREADS_LIBRARIES ${CMAKE_THREAD_LIBS_INIT})
+  endif()
+endif()
+
+###########################################################################
+# OpenImageIO and image libraries
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    add_definitions(
+      # OIIO changed the name of this define in newer versions
+      # we define both, so it would work with both old and new
+      # versions.
+      -DOIIO_STATIC_BUILD
+      -DOIIO_STATIC_DEFINE
+    )
+
+    set(OPENIMAGEIO_INCLUDE_DIR ${OPENIMAGEIO_ROOT_DIR}/include)
+    set(OPENIMAGEIO_INCLUDE_DIRS ${OPENIMAGEIO_INCLUDE_DIR} ${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO)
+    # Special exceptions for libraries which need an explicit debug version
+    set(OPENIMAGEIO_LIBRARIES
+      optimized ${OPENIMAGEIO_ROOT_DIR}/lib/OpenImageIO.lib
+      optimized ${OPENIMAGEIO_ROOT_DIR}/lib/OpenImageIO_Util.lib
+      debug ${OPENIMAGEIO_ROOT_DIR}/lib/OpenImageIO_d.lib
+      debug ${OPENIMAGEIO_ROOT_DIR}/lib/OpenImageIO_Util_d.lib
+    )
+
+    set(PUGIXML_INCLUDE_DIR ${PUGIXML_ROOT_DIR}/include)
+    set(PUGIXML_LIBRARIES
+      optimized ${PUGIXML_ROOT_DIR}/lib/pugixml.lib
+      debug ${PUGIXML_ROOT_DIR}/lib/pugixml_d.lib
+    )
+  else()
+    find_package(OpenImageIO REQUIRED)
+    if(OPENIMAGEIO_PUGIXML_FOUND)
+      set(PUGIXML_INCLUDE_DIR "${OPENIMAGEIO_INCLUDE_DIR}/OpenImageIO")
+      set(PUGIXML_LIBRARIES "")
+    else()
+      find_package(PugiXML REQUIRED)
+    endif()
+  endif()
+
+  # Dependencies (precompiled OpenJPEG paths are derived from OPENJPEG_ROOT_DIR).
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(OPENJPEG_INCLUDE_DIR ${OPENJPEG_ROOT_DIR}/include/openjpeg-2.3)
+    set(OPENJPEG_LIBRARIES ${OPENJPEG_ROOT_DIR}/lib/openjp2${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    find_package(OpenJPEG REQUIRED)
+  endif()
+
+  find_package(JPEG REQUIRED)
+  find_package(TIFF REQUIRED)
+  find_package(WebP)
+
+  if(EXISTS ${_cycles_lib_dir})
+    set(PNG_NAMES png16 libpng16 png libpng)
+  endif()
+  find_package(PNG REQUIRED)
+endif()
+
+###########################################################################
+# OpenEXR
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(OPENEXR_INCLUDE_DIR ${OPENEXR_ROOT_DIR}/include)
+    set(OPENEXR_INCLUDE_DIRS ${OPENEXR_INCLUDE_DIR} ${OPENEXR_ROOT_DIR}/include/OpenEXR ${IMATH_ROOT_DIR}/include ${IMATH_ROOT_DIR}/include/Imath)
+    set(OPENEXR_LIBRARIES
+      optimized ${OPENEXR_ROOT_DIR}/lib/OpenEXR_s.lib
+      optimized ${OPENEXR_ROOT_DIR}/lib/OpenEXRCore_s.lib
+      optimized ${OPENEXR_ROOT_DIR}/lib/Iex_s.lib
+      optimized ${IMATH_ROOT_DIR}/lib/Imath_s.lib
+      optimized ${OPENEXR_ROOT_DIR}/lib/IlmThread_s.lib
+      debug ${OPENEXR_ROOT_DIR}/lib/OpenEXR_s_d.lib
+      debug ${OPENEXR_ROOT_DIR}/lib/OpenEXRCore_s_d.lib
+      debug ${OPENEXR_ROOT_DIR}/lib/Iex_s_d.lib
+      debug ${IMATH_ROOT_DIR}/lib/Imath_s_d.lib
+      debug ${OPENEXR_ROOT_DIR}/lib/IlmThread_s_d.lib
+    )
+  else()
+    find_package(OpenEXR REQUIRED)
+  endif()
+endif()
+
+###########################################################################
+# OpenShadingLanguage & LLVM
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_OSL)
+  if(EXISTS ${_cycles_lib_dir})
+    set(LLVM_STATIC ON)
+  endif()
+
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    # TODO(sergey): On Windows llvm-config doesn't give proper results for the
+    # library names, use hardcoded libraries for now.
+    file(GLOB _llvm_libs_release ${LLVM_ROOT_DIR}/lib/*.lib)
+    file(GLOB _llvm_libs_debug ${LLVM_ROOT_DIR}/debug/lib/*.lib)
+    set(_llvm_libs)
+    foreach(_llvm_lib_path ${_llvm_libs_release})
+      get_filename_component(_llvm_lib_name ${_llvm_lib_path} ABSOLUTE)
+      list(APPEND _llvm_libs optimized ${_llvm_lib_name})
+    endforeach()
+    foreach(_llvm_lib_path ${_llvm_libs_debug})
+      get_filename_component(_llvm_lib_name ${_llvm_lib_path} ABSOLUTE)
+      list(APPEND _llvm_libs debug ${_llvm_lib_name})
+    endforeach()
+    set(LLVM_LIBRARY ${_llvm_libs})
+    unset(_llvm_lib_name)
+    unset(_llvm_lib_path)
+    unset(_llvm_libs)
+    unset(_llvm_libs_debug)
+    unset(_llvm_libs_release)
+
+    set(OSL_INCLUDE_DIR ${OSL_ROOT_DIR}/include)
+    set(OSL_LIBRARIES
+      optimized ${OSL_ROOT_DIR}/lib/oslcomp.lib
+      optimized ${OSL_ROOT_DIR}/lib/oslexec.lib
+      optimized ${OSL_ROOT_DIR}/lib/oslquery.lib
+      debug ${OSL_ROOT_DIR}/lib/oslcomp_d.lib
+      debug ${OSL_ROOT_DIR}/lib/oslexec_d.lib
+      debug ${OSL_ROOT_DIR}/lib/oslquery_d.lib
+      ${PUGIXML_LIBRARIES}
+    )
+
+    find_program(OSL_COMPILER NAMES oslc PATHS ${OSL_ROOT_DIR}/bin)
+  else()
+    find_package(OSL REQUIRED)
+    find_package(LLVM REQUIRED)
+    find_package(Clang REQUIRED)
+  endif()
+endif()
+
+###########################################################################
+# OpenPGL
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_PATH_GUIDING)
+  if(NOT openpgl_DIR AND EXISTS ${_cycles_lib_dir})
+    set(openpgl_DIR ${_cycles_lib_dir}/openpgl/lib/cmake/openpgl)
+  endif()
+
+  find_package(openpgl QUIET)
+  if(openpgl_FOUND)
+    if(WIN32)
+      get_target_property(OPENPGL_LIBRARIES_RELEASE openpgl::openpgl LOCATION_RELEASE)
+      get_target_property(OPENPGL_LIBRARIES_DEBUG openpgl::openpgl LOCATION_DEBUG)
+      set(OPENPGL_LIBRARIES optimized ${OPENPGL_LIBRARIES_RELEASE} debug ${OPENPGL_LIBRARIES_DEBUG})
+    else()
+      get_target_property(OPENPGL_LIBRARIES openpgl::openpgl LOCATION)
+    endif()
+    get_target_property(OPENPGL_INCLUDE_DIR openpgl::openpgl INTERFACE_INCLUDE_DIRECTORIES)
+  else()
+    set_and_warn_library_found("OpenPGL" openpgl_FOUND WITH_CYCLES_PATH_GUIDING)
+  endif()
+endif()
+
+###########################################################################
+# OpenColorIO
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_OPENCOLORIO)
+  set(WITH_OPENCOLORIO ON)
+
+  if(NOT USD_OVERRIDE_OPENCOLORIO)
+    if(MSVC AND EXISTS ${_cycles_lib_dir})
+      set(OPENCOLORIO_INCLUDE_DIRS ${OPENCOLORIO_ROOT_DIR}/include)
+      set(OPENCOLORIO_LIBRARIES
+        optimized ${OPENCOLORIO_ROOT_DIR}/lib/OpenColorIO.lib
+        optimized ${OPENCOLORIO_ROOT_DIR}/lib/libyaml-cpp.lib
+        optimized ${OPENCOLORIO_ROOT_DIR}/lib/libexpatMD.lib
+        optimized ${OPENCOLORIO_ROOT_DIR}/lib/pystring.lib
+        debug ${OPENCOLORIO_ROOT_DIR}/lib/OpencolorIO_d.lib
+        debug ${OPENCOLORIO_ROOT_DIR}/lib/libyaml-cpp_d.lib
+        debug ${OPENCOLORIO_ROOT_DIR}/lib/libexpatdMD.lib
+        debug ${OPENCOLORIO_ROOT_DIR}/lib/pystring_d.lib
+      )
+    else()
+      find_package(OpenColorIO REQUIRED)
+    endif()
+  endif()
+endif()
+
+###########################################################################
+# Boost
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(EXISTS ${_cycles_lib_dir})
+    if(MSVC)
+      set(Boost_USE_STATIC_RUNTIME OFF)
+      set(Boost_USE_MULTITHREADED ON)
+      set(Boost_USE_STATIC_LIBS ON)
+    else()
+      set(BOOST_LIBRARYDIR ${_cycles_lib_dir}/boost/lib)
+      set(Boost_NO_BOOST_CMAKE ON)
+      set(Boost_NO_SYSTEM_PATHS ON)
+    endif()
+  endif()
+
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(BOOST_INCLUDE_DIR ${BOOST_ROOT}/include)
+    set(BOOST_VERSION_HEADER ${BOOST_INCLUDE_DIR}/boost/version.hpp)
+    if(EXISTS ${BOOST_VERSION_HEADER})
+      file(STRINGS "${BOOST_VERSION_HEADER}" BOOST_LIB_VERSION REGEX "#define BOOST_LIB_VERSION ")
+      if(BOOST_LIB_VERSION MATCHES "#define BOOST_LIB_VERSION \"([0-9_]+)\"")
+        set(BOOST_VERSION "${CMAKE_MATCH_1}")
+      endif()
+    endif()
+    if(NOT BOOST_VERSION)
+      message(FATAL_ERROR "Unable to determine Boost version")
+    endif()
+    set(BOOST_POSTFIX "vc142-mt-x64-${BOOST_VERSION}.lib")
+    set(BOOST_DEBUG_POSTFIX "vc142-mt-gd-x64-${BOOST_VERSION}.lib")
+    set(BOOST_LIBRARIES
+      optimized ${BOOST_ROOT}/lib/libboost_date_time-${BOOST_POSTFIX}
+      optimized ${BOOST_ROOT}/lib/libboost_iostreams-${BOOST_POSTFIX}
+      optimized ${BOOST_ROOT}/lib/libboost_filesystem-${BOOST_POSTFIX}
+      optimized ${BOOST_ROOT}/lib/libboost_regex-${BOOST_POSTFIX}
+      optimized ${BOOST_ROOT}/lib/libboost_system-${BOOST_POSTFIX}
+      optimized ${BOOST_ROOT}/lib/libboost_thread-${BOOST_POSTFIX}
+      optimized ${BOOST_ROOT}/lib/libboost_chrono-${BOOST_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_date_time-${BOOST_DEBUG_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_iostreams-${BOOST_DEBUG_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_filesystem-${BOOST_DEBUG_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_regex-${BOOST_DEBUG_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_system-${BOOST_DEBUG_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_thread-${BOOST_DEBUG_POSTFIX}
+      debug ${BOOST_ROOT}/lib/libboost_chrono-${BOOST_DEBUG_POSTFIX}
+    )
+    if(WITH_CYCLES_OSL)
+      set(BOOST_LIBRARIES ${BOOST_LIBRARIES}
+        optimized ${BOOST_ROOT}/lib/libboost_wave-${BOOST_POSTFIX}
+        debug ${BOOST_ROOT}/lib/libboost_wave-${BOOST_DEBUG_POSTFIX})
+    endif()
+  else()
+    set(__boost_packages iostreams filesystem regex system thread date_time)
+    if(WITH_CYCLES_OSL)
+      list(APPEND __boost_packages wave)
+    endif()
+    find_package(Boost 1.48 COMPONENTS ${__boost_packages})
+    if(NOT Boost_FOUND)
+      # The first lookup is intentionally not REQUIRED so that this retry can
+      # run: look for the non-multithreaded library names if -mt was not found.
+      # The flag has nothing to do with thread safety; the retry is mandatory.
+      set(Boost_USE_MULTITHREADED OFF)
+      find_package(Boost 1.48 COMPONENTS ${__boost_packages} REQUIRED)
+    endif()
+    unset(__boost_packages)
+
+    set(BOOST_INCLUDE_DIR ${Boost_INCLUDE_DIRS})
+    set(BOOST_LIBRARIES ${Boost_LIBRARIES})
+    set(BOOST_LIBPATH ${Boost_LIBRARY_DIRS})
+  endif()
+
+  set(BOOST_DEFINITIONS "-DBOOST_ALL_NO_LIB ${BOOST_DEFINITIONS}")
+endif()
+
+###########################################################################
+# Embree
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_EMBREE)
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(EMBREE_INCLUDE_DIRS ${EMBREE_ROOT_DIR}/include)
+    set(EMBREE_LIBRARIES
+      optimized ${EMBREE_ROOT_DIR}/lib/embree3.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/embree_avx2.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/embree_avx.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/embree_sse42.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/lexers.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/math.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/simd.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/tasking.lib
+      optimized ${EMBREE_ROOT_DIR}/lib/sys.lib
+      debug ${EMBREE_ROOT_DIR}/lib/embree3_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/embree_avx2_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/embree_avx_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/embree_sse42_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/lexers_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/math_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/simd_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/sys_d.lib
+      debug ${EMBREE_ROOT_DIR}/lib/tasking_d.lib
+    )
+  else()
+    find_package(Embree 3.8.0 REQUIRED)
+  endif()
+endif()
+
+###########################################################################
+# Logging
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_LOGGING)
+  find_package(Glog REQUIRED)
+  find_package(Gflags REQUIRED)
+endif()
+
+###########################################################################
+# OpenSubdiv
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_OPENSUBDIV)
+  set(WITH_OPENSUBDIV ON)
+
+  if(NOT USD_OVERRIDE_OPENSUBDIV)
+    if(MSVC AND EXISTS ${_cycles_lib_dir})
+      set(OPENSUBDIV_INCLUDE_DIRS ${OPENSUBDIV_ROOT_DIR}/include)
+      set(OPENSUBDIV_LIBRARIES
+        optimized ${OPENSUBDIV_ROOT_DIR}/lib/osdCPU.lib
+        optimized ${OPENSUBDIV_ROOT_DIR}/lib/osdGPU.lib
+        debug ${OPENSUBDIV_ROOT_DIR}/lib/osdCPU_d.lib
+        debug ${OPENSUBDIV_ROOT_DIR}/lib/osdGPU_d.lib
+      )
+    else()
+      find_package(OpenSubdiv REQUIRED)
+    endif()
+  endif()
+endif()
+
+###########################################################################
+# OpenVDB
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_OPENVDB)
+  set(WITH_OPENVDB ON)
+  set(OPENVDB_DEFINITIONS -DNOMINMAX -D_USE_MATH_DEFINES)
+
+  if(NOT USD_OVERRIDE_OPENVDB)
+    find_package(OpenVDB REQUIRED)
+
+    if(MSVC AND EXISTS ${_cycles_lib_dir})
+      set(BLOSC_LIBRARY
+        optimized ${BLOSC_ROOT_DIR}/lib/libblosc.lib
+        debug ${BLOSC_ROOT_DIR}/lib/libblosc_d.lib
+      )
+    else()
+      find_package(Blosc REQUIRED)
+    endif()
+  endif()
+endif()
+
+###########################################################################
+# NanoVDB
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_NANOVDB)
+  set(WITH_NANOVDB ON)
+
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(NANOVDB_INCLUDE_DIR ${NANOVDB_ROOT_DIR}/include)
+    set(NANOVDB_INCLUDE_DIRS ${NANOVDB_INCLUDE_DIR})
+  else()
+    find_package(NanoVDB REQUIRED)
+  endif()
+endif()
+
+###########################################################################
+# OpenImageDenoise
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY AND WITH_CYCLES_OPENIMAGEDENOISE)
+  set(WITH_OPENIMAGEDENOISE ON)
+
+  if(MSVC AND EXISTS ${_cycles_lib_dir})
+    set(OPENIMAGEDENOISE_INCLUDE_DIRS ${OPENIMAGEDENOISE_ROOT_DIR}/include)
+    set(OPENIMAGEDENOISE_LIBRARIES
+      optimized ${OPENIMAGEDENOISE_ROOT_DIR}/lib/OpenImageDenoise.lib
+      optimized ${OPENIMAGEDENOISE_ROOT_DIR}/lib/common.lib
+      optimized ${OPENIMAGEDENOISE_ROOT_DIR}/lib/dnnl.lib
+      debug ${OPENIMAGEDENOISE_ROOT_DIR}/lib/OpenImageDenoise_d.lib
+      debug ${OPENIMAGEDENOISE_ROOT_DIR}/lib/common_d.lib
+      debug ${OPENIMAGEDENOISE_ROOT_DIR}/lib/dnnl_d.lib
+    )
+  else()
+    find_package(OpenImageDenoise REQUIRED)
+  endif()
+endif()
+
+###########################################################################
+# TBB
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if(NOT USD_OVERRIDE_TBB)
+    if(MSVC AND EXISTS ${_cycles_lib_dir})
+      set(TBB_INCLUDE_DIRS ${TBB_ROOT_DIR}/include)
+      set(TBB_LIBRARIES
+        optimized ${TBB_ROOT_DIR}/lib/tbb.lib
+        debug ${TBB_ROOT_DIR}/lib/tbb_debug.lib
+      )
+    else()
+      find_package(TBB REQUIRED)
+    endif()
+  endif()
+endif()
+
+###########################################################################
+# Epoxy
+###########################################################################
+
+if(CYCLES_STANDALONE_REPOSITORY)
+  if((WITH_CYCLES_STANDALONE AND WITH_CYCLES_STANDALONE_GUI) OR
+     WITH_CYCLES_HYDRA_RENDER_DELEGATE)
+    if(MSVC AND EXISTS ${_cycles_lib_dir})
+      set(Epoxy_LIBRARIES "${_cycles_lib_dir}/epoxy/lib/epoxy.lib")
+      set(Epoxy_INCLUDE_DIRS "${_cycles_lib_dir}/epoxy/include")
+    else()
+      find_package(Epoxy REQUIRED)
+    endif()
+  endif()
+endif()
+
+###########################################################################
+# Alembic
+###########################################################################
+
+if(WITH_CYCLES_ALEMBIC)
+  if(CYCLES_STANDALONE_REPOSITORY)
+    if(MSVC AND EXISTS ${_cycles_lib_dir})
+      set(ALEMBIC_INCLUDE_DIRS ${_cycles_lib_dir}/alembic/include)
+      set(ALEMBIC_LIBRARIES
+        optimized ${_cycles_lib_dir}/alembic/lib/Alembic.lib
+        debug ${_cycles_lib_dir}/alembic/lib/Alembic_d.lib)
+    else()
+      find_package(Alembic REQUIRED)
+    endif()
+
+    set(WITH_ALEMBIC ON)
+  endif()
+endif()
+
+###########################################################################
+# System Libraries
+###########################################################################
+
+# Detect system libraries again
+if(EXISTS ${_cycles_lib_dir})
+  unset(CMAKE_IGNORE_PATH)
+  unset(_cycles_lib_dir)
+endif()
+
 ###########################################################################
 # SDL
 ###########################################################################
@@ -109,3 +687,5 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
    set(WITH_CYCLES_DEVICE_ONEAPI OFF)
  endif()
 endif()
+
+unset(_cycles_lib_dir)
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -8,28 +8,13 @@ set(INC
 set(INC_SYS )

 if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
-  if(WITH_CUDA_DYNLOAD)
-    list(APPEND INC
-      ../../../extern/cuew/include
-    )
-    add_definitions(-DWITH_CUDA_DYNLOAD)
-  else()
-    list(APPEND INC_SYS
-      ${CUDA_TOOLKIT_INCLUDE}
-    )
+  if(NOT WITH_CUDA_DYNLOAD)
    add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
  endif()

  add_definitions(-DCYCLES_RUNTIME_OPTIX_ROOT_DIR="${CYCLES_RUNTIME_OPTIX_ROOT_DIR}")
 endif()

-if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
-  list(APPEND INC
-    ../../../extern/hipew/include
-  )
-  add_definitions(-DWITH_HIP_DYNLOAD)
-endif()
-
 set(SRC_BASE
  device.cpp
  denoise.cpp
@@ -168,24 +153,15 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
  )
 endif()

-if(WITH_CYCLES_DEVICE_CUDA)
-  add_definitions(-DWITH_CUDA)
-endif()
-if(WITH_CYCLES_DEVICE_HIP)
-  add_definitions(-DWITH_HIP)
-endif()
-if(WITH_CYCLES_DEVICE_OPTIX)
-  add_definitions(-DWITH_OPTIX)
-endif()
 if(WITH_CYCLES_DEVICE_METAL)
  list(APPEND LIB
    ${METAL_LIBRARY}
  )
-  add_definitions(-DWITH_METAL)
  list(APPEND SRC
    ${SRC_METAL}
  )
 endif()
+
 if (WITH_CYCLES_DEVICE_ONEAPI)
  if(WITH_CYCLES_ONEAPI_BINARIES)
    set(cycles_kernel_oneapi_lib_suffix "_aot")
@@ -203,7 +179,6 @@ if (WITH_CYCLES_DEVICE_ONEAPI)
  else()
    list(APPEND LIB ${SYCL_LIBRARY})
  endif()
-  add_definitions(-DWITH_ONEAPI)
  list(APPEND SRC
    ${SRC_ONEAPI}
  )
--- a/intern/cycles/device/denoise.h
+++ b/intern/cycles/device/denoise.h
@@ -78,24 +78,4 @@ class DenoiseParams : public Node {
  }
 };

-/* All the parameters needed to perform buffer denoising on a device.
- * Is not really a task in its canonical terms (as in, is not an asynchronous running task). Is
- * more like a wrapper for all the arguments and parameters needed to perform denoising. Is a
- * single place where they are all listed, so that it's not required to modify all device methods
- * when these parameters do change. */
-class DeviceDenoiseTask {
- public:
-  DenoiseParams params;
-
-  int num_samples;
-
-  RenderBuffers *render_buffers;
-  BufferParams buffer_params;
-
-  /* Allow to do in-place modification of the input passes (scaling them down i.e.). This will
-   * lower the memory footprint of the denoiser but will make input passes "invalid" (from path
-   * tracer) point of view. */
-  bool allow_inplace_modification;
-};
-
 CCL_NAMESPACE_END
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -160,6 +160,11 @@ class Device {
    return true;
  }

+  virtual bool load_osl_kernels()
+  {
+    return true;
+  }
+
  /* GPU device only functions.
   * These may not be used on CPU or multi-devices. */

@@ -228,21 +233,6 @@ class Device {
    return nullptr;
  }

-  /* Buffer denoising. */
-
-  /* Returns true if task is fully handled. */
-  virtual bool denoise_buffer(const DeviceDenoiseTask & /*task*/)
-  {
-    LOG(ERROR) << "Request buffer denoising from a device which does not support it.";
-    return false;
-  }
-
-  virtual DeviceQueue *get_denoise_queue()
-  {
-    LOG(ERROR) << "Request denoising queue from a device which does not support it.";
-    return nullptr;
-  }
-
  /* Sub-devices */

  /* Run given callback for every individual device which will be handling rendering.
--- a/intern/cycles/device/kernel.cpp
+++ b/intern/cycles/device/kernel.cpp
@@ -7,6 +7,30 @@

 CCL_NAMESPACE_BEGIN

+bool device_kernel_has_shading(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+          kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY);
+}
+
+bool device_kernel_has_intersection(DeviceKernel kernel)
+{
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
+          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
+}
+
 const char *device_kernel_as_string(DeviceKernel kernel)
 {
  switch (kernel) {
--- a/intern/cycles/device/kernel.h
+++ b/intern/cycles/device/kernel.h
@@ -11,6 +11,9 @@

 CCL_NAMESPACE_BEGIN

+bool device_kernel_has_shading(DeviceKernel kernel);
+bool device_kernel_has_intersection(DeviceKernel kernel);
+
 const char *device_kernel_as_string(DeviceKernel kernel);
 std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);

--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -45,6 +45,36 @@ bool kernel_has_intersection(DeviceKernel device_kernel)
 struct ShaderCache {
  ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
  {
+    /* Initialize occupancy tuning LUT. */
+    if (MetalInfo::get_device_vendor(mtlDevice) == METAL_GPU_APPLE) {
+      switch (MetalInfo::get_apple_gpu_architecture(mtlDevice)) {
+        default:
+        case APPLE_M2:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {32, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {832, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {64, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {704, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {1024, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {64, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {256, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {448, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {1024, 1024};
+          break;
+        case APPLE_M1:
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES] = {256, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA] = {768, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW] = {384, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE] = {512, 64};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY] = {512, 256};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND] = {512, 128};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW] = {384, 32};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] = {576, 384};
+          occupancy_tuning[DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY] = {832, 832};
+          break;
+      }
+    }
  }
  ~ShaderCache();

@@ -73,6 +103,11 @@ struct ShaderCache {
    std::function<void(MetalKernelPipeline *)> completionHandler;
  };

+  struct OccupancyTuningParameters {
+    int threads_per_threadgroup = 0;
+    int num_threads_per_block = 0;
+  } occupancy_tuning[DEVICE_KERNEL_NUM];
+
  std::mutex cache_mutex;

  PipelineCollection pipelines[DEVICE_KERNEL_NUM];
@@ -230,6 +265,13 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
  request.pipeline->device_kernel = device_kernel;
  request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

+  if (occupancy_tuning[device_kernel].threads_per_threadgroup) {
+    request.pipeline->threads_per_threadgroup =
+        occupancy_tuning[device_kernel].threads_per_threadgroup;
+    request.pipeline->num_threads_per_block =
+        occupancy_tuning[device_kernel].num_threads_per_block;
+  }
+
  /* metalrt options */
  request.pipeline->use_metalrt = device->use_metalrt;
  request.pipeline->metalrt_features = device->use_metalrt ?
@@ -384,13 +426,6 @@ void MetalKernelPipeline::compile()
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

-  int threads_per_threadgroup = this->threads_per_threadgroup;
-  if (device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL &&
-      device_kernel < DEVICE_KERNEL_INTEGRATOR_RESET) {
-    /* Always use 512 for the sorting kernels */
-    threads_per_threadgroup = 512;
-  }
-
  NSString *entryPoint = [@(function_name.c_str()) copy];

  NSError *error = NULL;
@@ -601,7 +636,9 @@ void MetalKernelPipeline::compile()
    metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
    path_create_directories(metalbin_path);

-    if (path_exists(metalbin_path) && use_binary_archive) {
+    /* Retrieve shader binary from disk, and update the file timestamp for LRU purging to work as
+     * intended. */
+    if (use_binary_archive && path_cache_kernel_exists_and_mark_used(metalbin_path)) {
      if (@available(macOS 11.0, *)) {
        MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
        archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
@@ -662,12 +699,14 @@ void MetalKernelPipeline::compile()
      return;
    }

-    int num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
-                                           computePipelineState.threadExecutionWidth);
-    num_threads_per_block = std::max(num_threads_per_block,
-                                     (int)computePipelineState.threadExecutionWidth);
+    if (!num_threads_per_block) {
+      num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
+                                         computePipelineState.threadExecutionWidth);
+      num_threads_per_block = std::max(num_threads_per_block,
+                                       (int)computePipelineState.threadExecutionWidth);
+    }
+
    this->pipeline = computePipelineState;
-    this->num_threads_per_block = num_threads_per_block;

    if (@available(macOS 11.0, *)) {
      if (creating_new_archive || recreate_archive) {
@@ -676,6 +715,9 @@ void MetalKernelPipeline::compile()
          metal_printf("Failed to save binary archive, error:\n%s\n",
                       [[error localizedDescription] UTF8String]);
        }
+        else {
+          path_cache_kernel_mark_added_and_clear_old(metalbin_path);
+        }
      }
    }
  };
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -138,6 +138,15 @@ class MultiDevice : public Device {
    return true;
  }

+  bool load_osl_kernels() override
+  {
+    foreach (SubDevice &sub, devices)
+      if (!sub.device->load_osl_kernels())
+        return false; /* Abort on the first sub-device that fails. */
+
+    return true;
+  }
+
  void build_bvh(BVH *bvh, Progress &progress, bool refit) override
  {
    /* Try to build and share a single acceleration structure, if possible */
@@ -204,10 +213,12 @@ class MultiDevice : public Device {

  virtual void *get_cpu_osl_memory() override
  {
-    if (devices.size() > 1) {
+    /* Always return the OSL memory of the CPU device (this works since the constructor above
+     * guarantees that CPU devices are always added to the back). */
+    if (devices.size() > 1 && devices.back().device->info.type != DEVICE_CPU) {
      return NULL;
    }
-    return devices.front().device->get_cpu_osl_memory();
+    return devices.back().device->get_cpu_osl_memory();
  }

  bool is_resident(device_ptr key, Device *sub_device) override
--- a/intern/cycles/device/oneapi/device.cpp
+++ b/intern/cycles/device/oneapi/device.cpp
@@ -31,6 +31,8 @@ bool device_oneapi_init()
   * improves stability as of intel/LLVM SYCL-nightly/20220529.
   * All these env variable can be set beforehand by end-users and
   * will in that case -not- be overwritten. */
+  /* By default, enable only Level-Zero. If `CYCLES_ONEAPI_ALL_DEVICES` is set, CUDA and HIP are
+   * enabled as well. The OpenCL backend isn't currently well supported. */
 #  ifdef _WIN32
  if (getenv("SYCL_CACHE_PERSISTENT") == nullptr) {
    _putenv_s("SYCL_CACHE_PERSISTENT", "1");
@@ -39,7 +41,12 @@ bool device_oneapi_init()
    _putenv_s("SYCL_CACHE_THRESHOLD", "0");
  }
  if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
-    _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
+    if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
+      _putenv_s("SYCL_DEVICE_FILTER", "level_zero");
+    }
+    else {
+      _putenv_s("SYCL_DEVICE_FILTER", "level_zero,cuda,hip");
+    }
  }
  if (getenv("SYCL_ENABLE_PCI") == nullptr) {
    _putenv_s("SYCL_ENABLE_PCI", "1");
@@ -50,7 +57,12 @@ bool device_oneapi_init()
 #  elif __linux__
  setenv("SYCL_CACHE_PERSISTENT", "1", false);
  setenv("SYCL_CACHE_THRESHOLD", "0", false);
-  setenv("SYCL_DEVICE_FILTER", "level_zero", false);
+  if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
+    setenv("SYCL_DEVICE_FILTER", "level_zero", false);
+  }
+  else {
+    setenv("SYCL_DEVICE_FILTER", "level_zero,cuda,hip", false);
+  }
  setenv("SYCL_ENABLE_PCI", "1", false);
  setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
 #  endif
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -430,9 +430,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
  sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
  (void)usm_type;
  assert(usm_type == sycl::usm::alloc::device ||
-         ((device_type == sycl::info::device_type::cpu || allow_host) &&
-              usm_type == sycl::usm::alloc::host ||
-          usm_type == sycl::usm::alloc::unknown));
+         (usm_type == sycl::usm::alloc::host &&
+          (allow_host || device_type == sycl::info::device_type::cpu)) ||
+         usm_type == sycl::usm::alloc::unknown);
 #  else
  /* Silence warning about unused arguments. */
  (void)queue_;
--- a/intern/cycles/device/optix/device.cpp
+++ b/intern/cycles/device/optix/device.cpp
@@ -9,6 +9,10 @@

 #include "util/log.h"

+#ifdef WITH_OSL
+#  include <OSL/oslversion.h>
+#endif
+
 #ifdef WITH_OPTIX
 #  include <optix_function_table_definition.h>
 #endif
@@ -65,6 +69,9 @@ void device_optix_info(const vector<DeviceInfo> &cuda_devices, vector<DeviceInfo

    info.type = DEVICE_OPTIX;
    info.id += "_OptiX";
+#  if defined(WITH_OSL) && (OSL_VERSION_MINOR >= 13 || OSL_VERSION_MAJOR > 1)
+    info.has_osl = true;
+#  endif
    info.denoisers |= DENOISER_OPTIX;

    devices.push_back(info);
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -1,16 +1,14 @@
 /* SPDX-License-Identifier: Apache-2.0
- * Copyright 2019, NVIDIA Corporation.
- * Copyright 2019-2022 Blender Foundation. */
+ * Copyright 2019, NVIDIA Corporation
+ * Copyright 2019-2022 Blender Foundation */

 #pragma once

 #ifdef WITH_OPTIX

 #  include "device/cuda/device_impl.h"
-#  include "device/optix/queue.h"
 #  include "device/optix/util.h"
-#  include "kernel/types.h"
-#  include "util/unique_ptr.h"
+#  include "kernel/osl/globals.h"

 CCL_NAMESPACE_BEGIN

@@ -23,8 +21,16 @@ enum {
  PG_RGEN_INTERSECT_SHADOW,
  PG_RGEN_INTERSECT_SUBSURFACE,
  PG_RGEN_INTERSECT_VOLUME_STACK,
+  PG_RGEN_SHADE_BACKGROUND,
+  PG_RGEN_SHADE_LIGHT,
+  PG_RGEN_SHADE_SURFACE,
  PG_RGEN_SHADE_SURFACE_RAYTRACE,
  PG_RGEN_SHADE_SURFACE_MNEE,
+  PG_RGEN_SHADE_VOLUME,
+  PG_RGEN_SHADE_SHADOW,
+  PG_RGEN_EVAL_DISPLACE,
+  PG_RGEN_EVAL_BACKGROUND,
+  PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY,
  PG_MISS,
  PG_HITD, /* Default hit group. */
  PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
@@ -40,14 +46,14 @@ enum {
 };

 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
-static const int NUM_MIS_PROGRAM_GROUPS = 1;
+static const int NUM_MISS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
 static const int NUM_HIT_PROGRAM_GROUPS = 8;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 2;

 /* List of OptiX pipelines. */
-enum { PIP_SHADE_RAYTRACE, PIP_SHADE_MNEE, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_SHADE, PIP_INTERSECT, NUM_PIPELINES };

 /* A single shader binding table entry. */
 struct SbtRecord {
@@ -61,52 +67,35 @@ class OptiXDevice : public CUDADevice {
  OptixModule optix_module = NULL; /* All necessary OptiX kernels are in one module. */
  OptixModule builtin_modules[2] = {};
  OptixPipeline pipelines[NUM_PIPELINES] = {};
+  OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
+  OptixPipelineCompileOptions pipeline_options = {};

-  bool motion_blur = false;
  device_vector<SbtRecord> sbt_data;
  device_only_memory<KernelParamsOptiX> launch_params;
-  OptixTraversableHandle tlas_handle = 0;

+#  ifdef WITH_OSL
+  OSLGlobals osl_globals;
+  vector<OptixModule> osl_modules;
+  vector<OptixProgramGroup> osl_groups;
+#  endif
+
+ private:
+  OptixTraversableHandle tlas_handle = 0;
  vector<unique_ptr<device_only_memory<char>>> delayed_free_bvh_memory;
  thread_mutex delayed_free_bvh_mutex;

-  class Denoiser {
-   public:
-    explicit Denoiser(OptiXDevice *device);
-
-    OptiXDevice *device;
-    OptiXDeviceQueue queue;
-
-    OptixDenoiser optix_denoiser = nullptr;
-
-    /* Configuration size, as provided to `optixDenoiserSetup`.
-     * If the `optixDenoiserSetup()` was never used on the current `optix_denoiser` the
-     * `is_configured` will be false. */
-    bool is_configured = false;
-    int2 configured_size = make_int2(0, 0);
-
-    /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
-     * The memory layout goes as following: [denoiser state][scratch buffer]. */
-    device_only_memory<unsigned char> state;
-    OptixDenoiserSizes sizes = {};
-
-    bool use_pass_albedo = false;
-    bool use_pass_normal = false;
-    bool use_pass_flow = false;
-  };
-  Denoiser denoiser_;
-
 public:
  OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
  ~OptiXDevice();

- private:
  BVHLayoutMask get_bvh_layout_mask() const override;

  string compile_kernel_get_common_cflags(const uint kernel_features);

  bool load_kernels(const uint kernel_features) override;

+  bool load_osl_kernels() override;
+
  bool build_optix_bvh(BVHOptiX *bvh,
                       OptixBuildOperation operation,
                       const OptixBuildInput &build_input,
@@ -123,52 +112,7 @@ class OptiXDevice : public CUDADevice {

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

-  /* --------------------------------------------------------------------
-   * Denoising.
-   */
-
-  class DenoiseContext;
-  class DenoisePass;
-
-  virtual bool denoise_buffer(const DeviceDenoiseTask &task) override;
-  virtual DeviceQueue *get_denoise_queue() override;
-
-  /* Read guiding passes from the render buffers, preprocess them in a way which is expected by
-   * OptiX and store in the guiding passes memory within the given context.
-   *
-   * Pre=-processing of the guiding passes is to only happen once per context lifetime. DO not
-   * preprocess them for every pass which is being denoised. */
-  bool denoise_filter_guiding_preprocess(DenoiseContext &context);
-
-  /* Set fake albedo pixels in the albedo guiding pass storage.
-   * After this point only passes which do not need albedo for denoising can be processed. */
-  bool denoise_filter_guiding_set_fake_albedo(DenoiseContext &context);
-
-  void denoise_pass(DenoiseContext &context, PassType pass_type);
-
-  /* Read input color pass from the render buffer into the memory which corresponds to the noisy
-   * input within the given context. Pixels are scaled to the number of samples, but are not
-   * preprocessed yet. */
-  void denoise_color_read(DenoiseContext &context, const DenoisePass &pass);
-
-  /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
-   * denoiser result to the render buffer. */
-  bool denoise_filter_color_preprocess(DenoiseContext &context, const DenoisePass &pass);
-  bool denoise_filter_color_postprocess(DenoiseContext &context, const DenoisePass &pass);
-
-  /* Make sure the OptiX denoiser is created and configured. */
-  bool denoise_ensure(DenoiseContext &context);
-
-  /* Create OptiX denoiser descriptor if needed.
-   * Will do nothing if the current OptiX descriptor is usable for the given parameters.
-   * If the OptiX denoiser descriptor did re-allocate here it is left unconfigured. */
-  bool denoise_create_if_needed(DenoiseContext &context);
-
-  /* Configure existing OptiX denoiser descriptor for the use for the given task. */
-  bool denoise_configure_if_needed(DenoiseContext &context);
-
-  /* Run configured denoiser. */
-  bool denoise_run(DenoiseContext &context, const DenoisePass &pass);
+  void *get_cpu_osl_memory() override;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -24,21 +24,33 @@ void OptiXDeviceQueue::init_execution()
  CUDADeviceQueue::init_execution();
 }

-static bool is_optix_specific_kernel(DeviceKernel kernel)
+static bool is_optix_specific_kernel(DeviceKernel kernel, bool use_osl)
 {
-  return (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+#  ifdef WITH_OSL
+  /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
+  if (use_osl && device_kernel_has_shading(kernel)) {
+    return true;
+  }
+#  else
+  (void)use_osl;
+#  endif
+
+  return device_kernel_has_intersection(kernel);
 }

 bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               DeviceKernelArguments const &args)
 {
-  if (!is_optix_specific_kernel(kernel)) {
+  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
+
+#  ifdef WITH_OSL
+  const bool use_osl = static_cast<OSLGlobals *>(optix_device->get_cpu_osl_memory())->use;
+#  else
+  const bool use_osl = false;
+#  endif
+
+  if (!is_optix_specific_kernel(kernel, use_osl)) {
    return CUDADeviceQueue::enqueue(kernel, work_size, args);
  }

@@ -50,8 +62,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,

  const CUDAContextScope scope(cuda_device_);

-  OptiXDevice *const optix_device = static_cast<OptiXDevice *>(cuda_device_);
-
  const device_ptr sbt_data_ptr = optix_device->sbt_data.device_pointer;
  const device_ptr launch_params_ptr = optix_device->launch_params.device_pointer;

@@ -62,9 +72,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                        sizeof(device_ptr),
                        cuda_stream_));

-  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
-      kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE) {
+  if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST || device_kernel_has_shading(kernel)) {
    cuda_device_assert(
        cuda_device_,
        cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
@@ -72,6 +80,15 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
                          sizeof(device_ptr),
                          cuda_stream_));
  }
+  if (kernel == DEVICE_KERNEL_SHADER_EVAL_DISPLACE ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_BACKGROUND ||
+      kernel == DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY) {
+    cuda_device_assert(cuda_device_,
+                       cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, offset),
+                                         args.values[2],  // &d_offset
+                                         sizeof(int32_t),
+                                         cuda_stream_));
+  }

  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));

@@ -79,14 +96,35 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
  OptixShaderBindingTable sbt_params = {};

  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_LIGHT * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE * sizeof(SbtRecord);
+      break;
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
-      pipeline = optix_device->pipelines[PIP_SHADE_RAYTRACE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_RAYTRACE * sizeof(SbtRecord);
      break;
    case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
-      pipeline = optix_device->pipelines[PIP_SHADE_MNEE];
+      pipeline = optix_device->pipelines[PIP_SHADE];
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SURFACE_MNEE * sizeof(SbtRecord);
      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_VOLUME * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_SHADE_SHADOW * sizeof(SbtRecord);
+      break;
+
    case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
      pipeline = optix_device->pipelines[PIP_INTERSECT];
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -104,6 +142,20 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_VOLUME_STACK * sizeof(SbtRecord);
      break;

+    case DEVICE_KERNEL_SHADER_EVAL_DISPLACE:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_DISPLACE * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_EVAL_BACKGROUND * sizeof(SbtRecord);
+      break;
+    case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY:
+      pipeline = optix_device->pipelines[PIP_SHADE];
+      sbt_params.raygenRecord = sbt_data_ptr +
+                                PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY * sizeof(SbtRecord);
+      break;
+
    default:
      LOG(ERROR) << "Invalid kernel " << device_kernel_as_string(kernel)
                 << " is attempted to be enqueued.";
@@ -112,7 +164,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,

  sbt_params.missRecordBase = sbt_data_ptr + MISS_PROGRAM_GROUP_OFFSET * sizeof(SbtRecord);
  sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
-  sbt_params.missRecordCount = NUM_MIS_PROGRAM_GROUPS;
+  sbt_params.missRecordCount = NUM_MISS_PROGRAM_GROUPS;
  sbt_params.hitgroupRecordBase = sbt_data_ptr + HIT_PROGAM_GROUP_OFFSET * sizeof(SbtRecord);
  sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
  sbt_params.hitgroupRecordCount = NUM_HIT_PROGRAM_GROUPS;
@@ -120,6 +172,12 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel,
  sbt_params.callablesRecordCount = NUM_CALLABLE_PROGRAM_GROUPS;
  sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);

+#  ifdef WITH_OSL
+  if (use_osl) {
+    sbt_params.callablesRecordCount += static_cast<unsigned int>(optix_device->osl_groups.size());
+  }
+#  endif
+
  /* Launch the ray generation program. */
  optix_device_assert(optix_device,
                      optixLaunch(pipeline,
--- a/intern/cycles/integrator/CMakeLists.txt
+++ b/intern/cycles/integrator/CMakeLists.txt
@@ -8,7 +8,7 @@ set(INC
 set(SRC
  adaptive_sampling.cpp
  denoiser.cpp
-  denoiser_device.cpp
+  denoiser_gpu.cpp
  denoiser_oidn.cpp
  denoiser_optix.cpp
  path_trace.cpp
@@ -30,7 +30,7 @@ set(SRC
 set(SRC_HEADERS
  adaptive_sampling.h
  denoiser.h
-  denoiser_device.h
+  denoiser_gpu.h
  denoiser_oidn.h
  denoiser_optix.h
  path_trace.h
--- a/intern/cycles/integrator/denoiser.cpp
+++ b/intern/cycles/integrator/denoiser.cpp
@@ -16,9 +16,11 @@ unique_ptr<Denoiser> Denoiser::create(Device *path_trace_device, const DenoisePa
 {
  DCHECK(params.use);

+#ifdef WITH_OPTIX
  if (params.type == DENOISER_OPTIX && Device::available_devices(DEVICE_MASK_OPTIX).size()) {
    return make_unique<OptiXDenoiser>(path_trace_device, params);
  }
+#endif

  /* Always fallback to OIDN. */
  DenoiseParams oidn_params = params;
--- a/intern/cycles/integrator/denoiser_device.h
+++ b/intern/cycles/integrator/denoiser_device.h
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#pragma once
-
-#include "integrator/denoiser.h"
-#include "util/unique_ptr.h"
-
-CCL_NAMESPACE_BEGIN
-
-/* Denoiser which uses device-specific denoising implementation, such as OptiX denoiser which are
- * implemented as a part of a driver of specific device.
- *
- * This implementation makes sure the to-be-denoised buffer is available on the denoising device
- * and invoke denoising kernel via device API. */
-class DeviceDenoiser : public Denoiser {
- public:
-  DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params);
-  ~DeviceDenoiser();
-
-  virtual bool denoise_buffer(const BufferParams &buffer_params,
-                              RenderBuffers *render_buffers,
-                              const int num_samples,
-                              bool allow_inplace_modification) override;
-};
-
-CCL_NAMESPACE_END
--- a/intern/cycles/integrator/denoiser_device.cpp
+++ b/intern/cycles/integrator/denoiser_device.cpp
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#include "integrator/denoiser_device.h"
+#include "integrator/denoiser_gpu.h"

 #include "device/denoise.h"
 #include "device/device.h"
@@ -13,27 +13,27 @@

 CCL_NAMESPACE_BEGIN

-DeviceDenoiser::DeviceDenoiser(Device *path_trace_device, const DenoiseParams &params)
+DenoiserGPU::DenoiserGPU(Device *path_trace_device, const DenoiseParams &params)
    : Denoiser(path_trace_device, params)
 {
 }

-DeviceDenoiser::~DeviceDenoiser()
+DenoiserGPU::~DenoiserGPU()
 {
  /* Explicit implementation, to allow forward declaration of Device in the header. */
 }

-bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
-                                    RenderBuffers *render_buffers,
-                                    const int num_samples,
-                                    bool allow_inplace_modification)
+bool DenoiserGPU::denoise_buffer(const BufferParams &buffer_params,
+                                 RenderBuffers *render_buffers,
+                                 const int num_samples,
+                                 bool allow_inplace_modification)
 {
  Device *denoiser_device = get_denoiser_device();
  if (!denoiser_device) {
    return false;
  }

-  DeviceDenoiseTask task;
+  DenoiseTask task;
  task.params = params_;
  task.num_samples = num_samples;
  task.buffer_params = buffer_params;
@@ -50,8 +50,6 @@ bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
  else {
    VLOG_WORK << "Creating temporary buffer on denoiser device.";

-    DeviceQueue *queue = denoiser_device->get_denoise_queue();
-
    /* Create buffer which is available by the device used by denoiser. */

    /* TODO(sergey): Optimize data transfers. For example, only copy denoising related passes,
@@ -70,13 +68,13 @@ bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
           render_buffers->buffer.data(),
           sizeof(float) * local_render_buffers.buffer.size());

-    queue->copy_to_device(local_render_buffers.buffer);
+    denoiser_queue_->copy_to_device(local_render_buffers.buffer);

    task.render_buffers = &local_render_buffers;
    task.allow_inplace_modification = true;
  }

-  const bool denoise_result = denoiser_device->denoise_buffer(task);
+  const bool denoise_result = denoise_buffer(task);

  if (local_buffer_used) {
    local_render_buffers.copy_from_device();
@@ -90,4 +88,21 @@ bool DeviceDenoiser::denoise_buffer(const BufferParams &buffer_params,
  return denoise_result;
 }

+Device *DenoiserGPU::ensure_denoiser_device(Progress *progress)
+{
+  Device *denoiser_device = Denoiser::ensure_denoiser_device(progress);
+  if (!denoiser_device) {
+    return nullptr;
+  }
+
+  if (!denoiser_queue_) { /* Create the queue once, on first use. */
+    denoiser_queue_ = denoiser_device->gpu_queue_create();
+    if (!denoiser_queue_) {
+      return nullptr; /* Queue creation failed; treated like a missing device. */
+    }
+  }
+
+  return denoiser_device;
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/integrator/denoiser_gpu.h
+++ b/intern/cycles/integrator/denoiser_gpu.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#pragma once
+
+#include "integrator/denoiser.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Implementation of Denoiser which uses a device-specific denoising implementation, running on a
+ * GPU device queue. It makes sure the to-be-denoised buffer is available on the denoising device
+ * and invokes denoising kernels via the device queue API. */
+class DenoiserGPU : public Denoiser {
+ public:
+  DenoiserGPU(Device *path_trace_device, const DenoiseParams &params);
+  ~DenoiserGPU();
+
+  virtual bool denoise_buffer(const BufferParams &buffer_params,
+                              RenderBuffers *render_buffers,
+                              const int num_samples,
+                              bool allow_inplace_modification) override;
+
+ protected:
+  /* All the parameters needed to perform buffer denoising on a device.
+   * This is not a task in the canonical sense (it is not an asynchronously running task); it is
+   * a wrapper for all the arguments and parameters needed to perform denoising. They are listed
+   * in a single place so that device methods do not all need to be modified whenever these
+   * parameters change. */
+  class DenoiseTask {
+   public:
+    DenoiseParams params;
+
+    int num_samples;
+
+    RenderBuffers *render_buffers;
+    BufferParams buffer_params;
+
+    /* Allow in-place modification of the input passes (e.g. scaling them down). This lowers
+     * the memory footprint of the denoiser, but makes the input passes "invalid" from the path
+     * tracer's point of view. */
+    bool allow_inplace_modification;
+  };
+
+  /* Implemented by subclasses to denoise the given task. Returns true if it is fully handled. */
+  virtual bool denoise_buffer(const DenoiseTask & /*task*/) = 0;
+
+  virtual Device *ensure_denoiser_device(Progress *progress) override;
+
+  unique_ptr<DeviceQueue> denoiser_queue_; /* Created on first use in ensure_denoiser_device(). */
+};
+
+CCL_NAMESPACE_END
--- a/intern/cycles/integrator/denoiser_optix.cpp
+++ b/intern/cycles/integrator/denoiser_optix.cpp
@@ -1,16 +1,216 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#include "integrator/denoiser_optix.h"
+#ifdef WITH_OPTIX

-#include "device/denoise.h"
-#include "device/device.h"
+#  include "integrator/denoiser_optix.h"
+#  include "integrator/pass_accessor_gpu.h"
+
+#  include "device/optix/device_impl.h"
+#  include "device/optix/queue.h"
+
+#  include <optix_denoiser_tiling.h>

 CCL_NAMESPACE_BEGIN

-OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
-    : DeviceDenoiser(path_trace_device, params)
+#  if OPTIX_ABI_VERSION >= 60
+using ::optixUtilDenoiserInvokeTiled;
+#  else
+// A minimal copy of the functionality of `optix_denoiser_tiling.h`, which allows fixing integer
+// overflow issues without bumping the SDK or driver requirement.
+//
+// The original code is Copyright NVIDIA Corporation, BSD-3-Clause.
+// Split an input/output image pair into a grid of tiles and append them to `tiles`.
+//
+// Every tile reads an input window of `inp_w` x `inp_h` pixels (tile size plus the overlap on both
+// sides, clamped to the image size) and writes an output region of `copy_x` x `copy_y` pixels.
+// The `inputOffsetX/Y` stored in the tile is the position of the output region inside the input
+// window.
+//
+// Returns OPTIX_ERROR_INVALID_VALUE when `tileWidth` or `tileHeight` is zero, OPTIX_SUCCESS
+// otherwise. Existing content of `tiles` is kept, new tiles are appended row by row.
+static OptixResult optixUtilDenoiserSplitImage(const OptixImage2D &input,
+                                               const OptixImage2D &output,
+                                               unsigned int overlapWindowSizeInPixels,
+                                               unsigned int tileWidth,
+                                               unsigned int tileHeight,
+                                               std::vector<OptixUtilDenoiserImageTile> &tiles)
 {
+  if (tileWidth == 0 || tileHeight == 0)
+    return OPTIX_ERROR_INVALID_VALUE;
+
+  unsigned int inPixelStride = optixUtilGetPixelStride(input);
+  unsigned int outPixelStride = optixUtilGetPixelStride(output);
+
+  // Size of the input window of every tile: never larger than the image itself.
+  int inp_w = std::min(tileWidth + 2 * overlapWindowSizeInPixels, input.width);
+  int inp_h = std::min(tileHeight + 2 * overlapWindowSizeInPixels, input.height);
+  int inp_y = 0, copied_y = 0;
+
+  do {
+    // The first row of tiles starts at the image border and hence has no leading overlap. For the
+    // other rows the window starts at least one overlap earlier, and even earlier when needed to
+    // keep the whole window inside of the image.
+    int inputOffsetY = inp_y == 0 ? 0 :
+                                    std::max((int)overlapWindowSizeInPixels,
+                                             inp_h - ((int)input.height - inp_y));
+    int copy_y = inp_y == 0 ? std::min(input.height, tileHeight + overlapWindowSizeInPixels) :
+                              std::min(tileHeight, input.height - copied_y);
+
+    int inp_x = 0, copied_x = 0;
+    do {
+      // Same logic as for rows, applied to columns.
+      int inputOffsetX = inp_x == 0 ? 0 :
+                                      std::max((int)overlapWindowSizeInPixels,
+                                               inp_w - ((int)input.width - inp_x));
+      int copy_x = inp_x == 0 ? std::min(input.width, tileWidth + overlapWindowSizeInPixels) :
+                                std::min(tileWidth, input.width - copied_x);
+
+      OptixUtilDenoiserImageTile tile;
+      // Byte offsets are calculated in size_t (see the casts), presumably this is the integer
+      // overflow fix mentioned in the comment above this function.
+      tile.input.data = input.data + (size_t)(inp_y - inputOffsetY) * input.rowStrideInBytes +
+                        +(size_t)(inp_x - inputOffsetX) * inPixelStride;
+      tile.input.width = inp_w;
+      tile.input.height = inp_h;
+      tile.input.rowStrideInBytes = input.rowStrideInBytes;
+      tile.input.pixelStrideInBytes = input.pixelStrideInBytes;
+      tile.input.format = input.format;
+
+      tile.output.data = output.data + (size_t)inp_y * output.rowStrideInBytes +
+                         (size_t)inp_x * outPixelStride;
+      tile.output.width = copy_x;
+      tile.output.height = copy_y;
+      tile.output.rowStrideInBytes = output.rowStrideInBytes;
+      tile.output.pixelStrideInBytes = output.pixelStrideInBytes;
+      tile.output.format = output.format;
+
+      tile.inputOffsetX = inputOffsetX;
+      tile.inputOffsetY = inputOffsetY;
+      tiles.push_back(tile);
+
+      // The first tile also covers what would be the leading overlap of the next one.
+      inp_x += inp_x == 0 ? tileWidth + overlapWindowSizeInPixels : tileWidth;
+      copied_x += copy_x;
+    } while (inp_x < static_cast<int>(input.width));
+
+    inp_y += inp_y == 0 ? tileHeight + overlapWindowSizeInPixels : tileHeight;
+    copied_y += copy_y;
+  } while (inp_y < static_cast<int>(input.height));
+
+  return OPTIX_SUCCESS;
+}
+
+// Run the denoiser tile by tile.
+//
+// All layers (input/output, optional previous output) and all guide layers which have data are
+// split into tiles using `optixUtilDenoiserSplitImage()`, then `optixDenoiserInvoke()` is called
+// once per tile. The state and scratch memory are passed through to every invocation unchanged.
+//
+// Returns OPTIX_ERROR_INVALID_VALUE when `guideLayer` or `layers` is null or when `numLayers` is
+// zero, the first failing result of splitting or invocation otherwise, and OPTIX_SUCCESS when all
+// tiles were processed.
+static OptixResult optixUtilDenoiserInvokeTiled(OptixDenoiser denoiser,
+                                                CUstream stream,
+                                                const OptixDenoiserParams *params,
+                                                CUdeviceptr denoiserState,
+                                                size_t denoiserStateSizeInBytes,
+                                                const OptixDenoiserGuideLayer *guideLayer,
+                                                const OptixDenoiserLayer *layers,
+                                                unsigned int numLayers,
+                                                CUdeviceptr scratch,
+                                                size_t scratchSizeInBytes,
+                                                unsigned int overlapWindowSizeInPixels,
+                                                unsigned int tileWidth,
+                                                unsigned int tileHeight)
+{
+  // The tile count is taken from the first layer, so at least one layer is required: without it
+  // `tiles[0]` below would index an empty vector.
+  if (!guideLayer || !layers || numLayers == 0)
+    return OPTIX_ERROR_INVALID_VALUE;
+
+  std::vector<std::vector<OptixUtilDenoiserImageTile>> tiles(numLayers);
+  std::vector<std::vector<OptixUtilDenoiserImageTile>> prevTiles(numLayers);
+  for (unsigned int l = 0; l < numLayers; l++) {
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(layers[l].input,
+                                                                 layers[l].output,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 tiles[l]))
+      return res;
+
+    if (layers[l].previousOutput.data) {
+      // Only the input part of the resulting tiles is used, the output is a placeholder which is
+      // needed to satisfy the signature of the split function.
+      OptixImage2D dummyOutput = layers[l].previousOutput;
+      if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(layers[l].previousOutput,
+                                                                   dummyOutput,
+                                                                   overlapWindowSizeInPixels,
+                                                                   tileWidth,
+                                                                   tileHeight,
+                                                                   prevTiles[l]))
+        return res;
+    }
+  }
+
+  // Guide layers are split the same way; again only the input part of the tiles is used.
+  std::vector<OptixUtilDenoiserImageTile> albedoTiles;
+  if (guideLayer->albedo.data) {
+    OptixImage2D dummyOutput = guideLayer->albedo;
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->albedo,
+                                                                 dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 albedoTiles))
+      return res;
+  }
+
+  std::vector<OptixUtilDenoiserImageTile> normalTiles;
+  if (guideLayer->normal.data) {
+    OptixImage2D dummyOutput = guideLayer->normal;
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->normal,
+                                                                 dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 normalTiles))
+      return res;
+  }
+  std::vector<OptixUtilDenoiserImageTile> flowTiles;
+  if (guideLayer->flow.data) {
+    OptixImage2D dummyOutput = guideLayer->flow;
+    if (const OptixResult res = ccl::optixUtilDenoiserSplitImage(guideLayer->flow,
+                                                                 dummyOutput,
+                                                                 overlapWindowSizeInPixels,
+                                                                 tileWidth,
+                                                                 tileHeight,
+                                                                 flowTiles))
+      return res;
+  }
+
+  // Invoke the denoiser once per tile. The number of tiles and the offsets of the output region
+  // within the input window are taken from the first layer.
+  for (size_t t = 0; t < tiles[0].size(); t++) {
+    std::vector<OptixDenoiserLayer> tlayers;
+    for (unsigned int l = 0; l < numLayers; l++) {
+      OptixDenoiserLayer layer = {};
+      layer.input = (tiles[l])[t].input;
+      layer.output = (tiles[l])[t].output;
+      if (layers[l].previousOutput.data)
+        layer.previousOutput = (prevTiles[l])[t].input;
+      tlayers.push_back(layer);
+    }
+
+    OptixDenoiserGuideLayer gl = {};
+    if (guideLayer->albedo.data)
+      gl.albedo = albedoTiles[t].input;
+
+    if (guideLayer->normal.data)
+      gl.normal = normalTiles[t].input;
+
+    if (guideLayer->flow.data)
+      gl.flow = flowTiles[t].input;
+
+    if (const OptixResult res = optixDenoiserInvoke(denoiser,
+                                                    stream,
+                                                    params,
+                                                    denoiserState,
+                                                    denoiserStateSizeInBytes,
+                                                    &gl,
+                                                    &tlayers[0],
+                                                    numLayers,
+                                                    (tiles[0])[t].inputOffsetX,
+                                                    (tiles[0])[t].inputOffsetY,
+                                                    scratch,
+                                                    scratchSizeInBytes))
+      return res;
+  }
+  return OPTIX_SUCCESS;
+}
+#  endif
+
+/* Forwards the path tracing device and the denoising parameters to DenoiserGPU and constructs
+ * `state_` with the path tracing device and the name "__denoiser_state".
+ *
+ * The OptiX denoiser handle is not created here: it is created on demand by
+ * denoise_create_if_needed(). The device of `state_` is re-assigned to the denoiser device in
+ * denoise_configure_if_needed() before the memory is allocated. */
+OptiXDenoiser::OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params)
+    : DenoiserGPU(path_trace_device, params), state_(path_trace_device, "__denoiser_state", true)
+{
+}
+
+/* Destroys the OptiX denoiser handle, if one has been created. */
+OptiXDenoiser::~OptiXDenoiser()
+{
+  /* It is important that the OptixDenoiser handle is destroyed before the OptixDeviceContext
+   * handle, which is guaranteed since the local denoising device owning the OptiX device context
+   * is deleted as part of the Denoiser class destructor call after this. */
+  if (optix_denoiser_ != nullptr) {
+    optixDenoiserDestroy(optix_denoiser_);
+  }
 }

 uint OptiXDenoiser::get_device_type_mask() const
@@ -18,4 +218,569 @@ uint OptiXDenoiser::get_device_type_mask() const
  return DEVICE_MASK_OPTIX;
 }

+/* State of a single denoising invocation.
+ *
+ * Gathers everything what is needed to denoise passes of one DenoiseTask: which guiding passes
+ * are used and where they are stored, offsets of the passes in the render buffer, and the
+ * optional device buffer which holds a packed copy of the guiding passes.
+ *
+ * The context keeps references to `task.params` and `task.buffer_params`, so the task must
+ * outlive the context. */
+class OptiXDenoiser::DenoiseContext {
+ public:
+  explicit DenoiseContext(OptiXDevice *device, const DenoiseTask &task)
+      : denoise_params(task.params),
+        render_buffers(task.render_buffers),
+        buffer_params(task.buffer_params),
+        guiding_buffer(device, "denoiser guiding passes buffer", true),
+        num_samples(task.num_samples)
+  {
+    /* The color pass is always an input. Normal is only used together with albedo. */
+    num_input_passes = 1;
+    if (denoise_params.use_pass_albedo) {
+      num_input_passes += 1;
+      use_pass_albedo = true;
+      pass_denoising_albedo = buffer_params.get_pass_offset(PASS_DENOISING_ALBEDO);
+      if (denoise_params.use_pass_normal) {
+        num_input_passes += 1;
+        use_pass_normal = true;
+        pass_denoising_normal = buffer_params.get_pass_offset(PASS_DENOISING_NORMAL);
+      }
+    }
+
+    /* Temporally stable denoising uses the previous output and the motion pass. Both are read
+     * from the render buffers. */
+    if (denoise_params.temporally_stable) {
+      prev_output.device_pointer = render_buffers->buffer.device_pointer;
+
+      prev_output.offset = buffer_params.get_pass_offset(PASS_DENOISING_PREVIOUS);
+
+      prev_output.stride = buffer_params.stride;
+      prev_output.pass_stride = buffer_params.pass_stride;
+
+      num_input_passes += 1;
+      use_pass_motion = true;
+      pass_motion = buffer_params.get_pass_offset(PASS_MOTION);
+    }
+
+    use_guiding_passes = (num_input_passes - 1) > 0;
+
+    if (use_guiding_passes) {
+      if (task.allow_inplace_modification) {
+        /* Guiding passes are stored directly in the render buffers, using the layout of the
+         * render buffers. */
+        guiding_params.device_pointer = render_buffers->buffer.device_pointer;
+
+        guiding_params.pass_albedo = pass_denoising_albedo;
+        guiding_params.pass_normal = pass_denoising_normal;
+        guiding_params.pass_flow = pass_motion;
+
+        guiding_params.stride = buffer_params.stride;
+        guiding_params.pass_stride = buffer_params.pass_stride;
+      }
+      else {
+        /* Guiding passes are stored in a dedicated buffer, tightly packed per pixel: 3 floats
+         * for albedo, 3 floats for normal, 2 floats for flow (only the ones which are used). */
+        guiding_params.pass_stride = 0;
+        if (use_pass_albedo) {
+          guiding_params.pass_albedo = guiding_params.pass_stride;
+          guiding_params.pass_stride += 3;
+        }
+        if (use_pass_normal) {
+          guiding_params.pass_normal = guiding_params.pass_stride;
+          guiding_params.pass_stride += 3;
+        }
+        if (use_pass_motion) {
+          guiding_params.pass_flow = guiding_params.pass_stride;
+          guiding_params.pass_stride += 2;
+        }
+
+        guiding_params.stride = buffer_params.width;
+
+        /* Calculate the number of elements in size_t: a product of three ints can exceed INT_MAX
+         * for very large frames (for example, 16384 x 16384 pixels with 8 floats per pixel). */
+        guiding_buffer.alloc_to_device(static_cast<size_t>(buffer_params.width) *
+                                       buffer_params.height * guiding_params.pass_stride);
+        guiding_params.device_pointer = guiding_buffer.device_pointer;
+      }
+    }
+
+    pass_sample_count = buffer_params.get_pass_offset(PASS_SAMPLE_COUNT);
+  }
+
+  /* References to the corresponding fields of the task the context was created for. */
+  const DenoiseParams &denoise_params;
+
+  RenderBuffers *render_buffers = nullptr;
+  const BufferParams &buffer_params;
+
+  /* Previous output. Only initialized when temporally stable denoising is enabled. */
+  struct {
+    device_ptr device_pointer = 0;
+
+    int offset = PASS_UNUSED;
+
+    int stride = -1;
+    int pass_stride = -1;
+  } prev_output;
+
+  /* Device-side storage of the guiding passes. Only allocated when guiding passes are used and
+   * in-place modification of the render buffers is not allowed. */
+  device_only_memory<float> guiding_buffer;
+
+  /* Location and layout of the guiding passes: either within the render buffers or within the
+   * `guiding_buffer`. */
+  struct {
+    device_ptr device_pointer = 0;
+
+    /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
+    int pass_albedo = PASS_UNUSED;
+    int pass_normal = PASS_UNUSED;
+    int pass_flow = PASS_UNUSED;
+
+    int stride = -1;
+    int pass_stride = -1;
+  } guiding_params;
+
+  /* Number of input passes. Including the color and extra auxiliary passes. */
+  int num_input_passes = 0;
+  bool use_guiding_passes = false;
+  bool use_pass_albedo = false;
+  bool use_pass_normal = false;
+  bool use_pass_motion = false;
+
+  int num_samples = 0;
+
+  /* Offset of the sample count pass in the render buffers. */
+  int pass_sample_count = PASS_UNUSED;
+
+  /* NOTE: Are only initialized when the corresponding guiding pass is enabled. */
+  int pass_denoising_albedo = PASS_UNUSED;
+  int pass_denoising_normal = PASS_UNUSED;
+  int pass_motion = PASS_UNUSED;
+
+  /* For passes which don't need albedo channel for denoising we replace the actual albedo with
+   * the (0.5, 0.5, 0.5). This flag indicates that the real albedo pass has been replaced with
+   * the fake values and denoising of passes which do need albedo can no longer happen. */
+  bool albedo_replaced_with_fake = false;
+};
+
+/* Description of a single pass which is to be denoised: offsets of its noisy and denoised
+ * variants in the render buffers, and properties of the pass type which affect denoising. */
+class OptiXDenoiser::DenoisePass {
+ public:
+  DenoisePass(const PassType type, const BufferParams &buffer_params)
+      : type(type),
+        noisy_offset(buffer_params.get_pass_offset(type, PassMode::NOISY)),
+        denoised_offset(buffer_params.get_pass_offset(type, PassMode::DENOISED))
+  {
+    /* The rest of the fields are copied from the static information about the pass type. */
+    const PassInfo info = Pass::get_info(type);
+
+    num_components = info.num_components;
+    use_compositing = info.use_compositing;
+    use_denoising_albedo = info.use_denoising_albedo;
+  }
+
+  PassType type;
+
+  /* Offsets of the pass in the render buffers. */
+  int noisy_offset;
+  int denoised_offset;
+
+  /* Properties of the pass type. */
+  int num_components;
+  bool use_compositing;
+  bool use_denoising_albedo;
+};
+
+/* Denoise all supported passes of the given task.
+ *
+ * Returns false when the denoiser could not be created or configured, or when the guiding
+ * passes could not be preprocessed. Failures of individual passes are logged by denoise_pass()
+ * and do not affect the return value. */
+bool OptiXDenoiser::denoise_buffer(const DenoiseTask &task)
+{
+  OptiXDevice *const device = static_cast<OptiXDevice *>(denoiser_device_);
+
+  /* The scope is created before the context, so that it is still alive when the context is
+   * destroyed. */
+  const CUDAContextScope cuda_scope(device);
+  DenoiseContext denoise_context(device, task);
+
+  const bool is_ready = denoise_ensure(denoise_context);
+  if (!is_ready) {
+    return false;
+  }
+
+  const bool is_guiding_ready = denoise_filter_guiding_preprocess(denoise_context);
+  if (!is_guiding_ready) {
+    LOG(ERROR) << "Error preprocessing guiding passes.";
+    return false;
+  }
+
+  /* The order matters: passes which use the real albedo (when it is available) are to be
+   * denoised first. The shadow catcher pass does not need albedo, so denoising it replaces the
+   * real albedo with the fake one. */
+  const PassType passes_to_denoise[] = {
+      PASS_COMBINED,
+      PASS_SHADOW_CATCHER_MATTE,
+      PASS_SHADOW_CATCHER,
+  };
+  for (const PassType pass_type : passes_to_denoise) {
+    denoise_pass(denoise_context, pass_type);
+  }
+
+  return true;
+}
+
+/* Enqueue the DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS kernel on the denoiser queue, with one
+ * work item per pixel of the buffer.
+ *
+ * Returns the result of the enqueue call. */
+bool OptiXDenoiser::denoise_filter_guiding_preprocess(const DenoiseContext &context)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  /* Arguments are passed by address. First goes the destination (guiding passes storage and its
+   * layout), then the source (render buffers, their layout and offsets of passes in them).
+   * NOTE(review): the order presumably has to match the parameter list of the kernel, which is
+   * not visible here -- do not reorder without checking the kernel. */
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &context.guiding_params.pass_normal,
+                             &context.guiding_params.pass_flow,
+                             &context.render_buffers->buffer.device_pointer,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.pass_sample_count,
+                             &context.pass_denoising_albedo,
+                             &context.pass_denoising_normal,
+                             &context.pass_motion,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &context.num_samples);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS, work_size, args);
+}
+
+/* Enqueue the DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO kernel on the denoiser queue, with
+ * one work item per pixel of the buffer. The kernel operates on the albedo pass of the guiding
+ * passes storage.
+ *
+ * Returns the result of the enqueue call. */
+bool OptiXDenoiser::denoise_filter_guiding_set_fake_albedo(const DenoiseContext &context)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  /* NOTE(review): the order of arguments presumably has to match the parameter list of the
+   * kernel, which is not visible here. */
+  DeviceKernelArguments args(&context.guiding_params.device_pointer,
+                             &context.guiding_params.pass_stride,
+                             &context.guiding_params.pass_albedo,
+                             &buffer_params.width,
+                             &buffer_params.height);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO, work_size, args);
+}
+
+/* Denoise a single pass of the given type.
+ *
+ * Nothing is done when the noisy variant of the pass is not present in the render buffers.
+ * All failures are logged and abort denoising of this pass. */
+void OptiXDenoiser::denoise_pass(DenoiseContext &context, PassType pass_type)
+{
+  const DenoisePass pass(pass_type, context.buffer_params);
+
+  /* The pass is not rendered: nothing to denoise. */
+  if (pass.noisy_offset == PASS_UNUSED) {
+    return;
+  }
+
+  /* The pass is rendered, but there is no place to store the result. */
+  if (pass.denoised_offset == PASS_UNUSED) {
+    LOG(DFATAL) << "Missing denoised pass " << pass_type_as_string(pass_type);
+    return;
+  }
+
+  /* A pass which needs the real albedo can not be denoised once the albedo has been replaced. */
+  if (pass.use_denoising_albedo && context.albedo_replaced_with_fake) {
+    LOG(ERROR) << "Pass which requires albedo is denoised after fake albedo has been set.";
+    return;
+  }
+
+  /* A pass which does not need albedo: make sure the fake albedo is used from now on. */
+  const bool need_fake_albedo = !pass.use_denoising_albedo && context.use_guiding_passes &&
+                                !context.albedo_replaced_with_fake;
+  if (need_fake_albedo) {
+    context.albedo_replaced_with_fake = true;
+    if (!denoise_filter_guiding_set_fake_albedo(context)) {
+      LOG(ERROR) << "Error replacing real albedo with the fake one.";
+      return;
+    }
+  }
+
+  /* Fetch the noisy color input and convert it to what the denoiser expects. */
+  denoise_color_read(context, pass);
+  const bool is_preprocessed = denoise_filter_color_preprocess(context, pass);
+  if (!is_preprocessed) {
+    LOG(ERROR) << "Error converting denoising passes to RGB buffer.";
+    return;
+  }
+
+  const bool is_denoised = denoise_run(context, pass);
+  if (!is_denoised) {
+    LOG(ERROR) << "Error running OptiX denoiser.";
+    return;
+  }
+
+  /* Write the result to the denoised pass of the render buffer.
+   *
+   * The denoiser result is scaled up, so that it matches the number of samples (which might be
+   * different for every pixel). */
+  const bool is_postprocessed = denoise_filter_color_postprocess(context, pass);
+  if (!is_postprocessed) {
+    LOG(ERROR) << "Error copying denoiser result to the denoised pass.";
+    return;
+  }
+
+  /* Wait for all the enqueued work to finish. */
+  denoiser_queue_->synchronize();
+}
+
+/* Read the noisy variant of the pass from the render buffers and store it as 3 components in
+ * the memory of the denoised variant of the same pass. */
+void OptiXDenoiser::denoise_color_read(const DenoiseContext &context, const DenoisePass &pass)
+{
+  PassAccessor::PassAccessInfo access_info;
+  access_info.type = pass.type;
+  access_info.mode = PassMode::NOISY;
+  access_info.offset = pass.noisy_offset;
+
+  /* The denoiser is only ever run on the passes the approximation is calculated from, never on
+   * the approximation itself. Denoising the approximation would not work anyway, since OptiX
+   * does not support denoising of semi-transparent pixels. */
+  access_info.use_approximate_shadow_catcher = false;
+  access_info.use_approximate_shadow_catcher_background = false;
+  access_info.show_active_pixels = false;
+
+  /* TODO(sergey): Consider adding support of actual exposure, to avoid clamping in extreme cases.
+   */
+  const float exposure = 1.0f;
+  const PassAccessorGPU accessor(
+      denoiser_queue_.get(), access_info, exposure, context.num_samples);
+
+  /* Pixels are written to the denoised pass within the render buffers. */
+  const device_ptr d_render_buffer = context.render_buffers->buffer.device_pointer;
+  PassAccessor::Destination destination(access_info.type);
+  destination.d_pixels = d_render_buffer + pass.denoised_offset * sizeof(float);
+  destination.num_components = 3;
+  destination.pixel_stride = context.buffer_params.pass_stride;
+
+  /* Read the entire buffer: make the window cover all of it. */
+  BufferParams full_buffer_params = context.buffer_params;
+  full_buffer_params.window_x = 0;
+  full_buffer_params.window_y = 0;
+  full_buffer_params.window_width = full_buffer_params.width;
+  full_buffer_params.window_height = full_buffer_params.height;
+
+  accessor.get_render_tile_pixels(context.render_buffers, full_buffer_params, destination);
+}
+
+/* Enqueue the DEVICE_KERNEL_FILTER_COLOR_PREPROCESS kernel on the denoiser queue, with one work
+ * item per pixel of the buffer. The kernel is given the render buffers and the offset of the
+ * denoised variant of the pass (which is where denoise_color_read() stores the pixels).
+ *
+ * Returns the result of the enqueue call. */
+bool OptiXDenoiser::denoise_filter_color_preprocess(const DenoiseContext &context,
+                                                    const DenoisePass &pass)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  /* NOTE(review): the order of arguments presumably has to match the parameter list of the
+   * kernel, which is not visible here. */
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &pass.denoised_offset);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_PREPROCESS, work_size, args);
+}
+
+/* Enqueue the DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS kernel on the denoiser queue, with one work
+ * item per pixel of the buffer. The kernel is given the render buffers, offsets of both noisy
+ * and denoised variants of the pass, the sample count information and properties of the pass.
+ *
+ * Returns the result of the enqueue call. */
+bool OptiXDenoiser::denoise_filter_color_postprocess(const DenoiseContext &context,
+                                                     const DenoisePass &pass)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+
+  const int work_size = buffer_params.width * buffer_params.height;
+
+  /* NOTE(review): the order of arguments presumably has to match the parameter list of the
+   * kernel, which is not visible here. */
+  DeviceKernelArguments args(&context.render_buffers->buffer.device_pointer,
+                             &buffer_params.full_x,
+                             &buffer_params.full_y,
+                             &buffer_params.width,
+                             &buffer_params.height,
+                             &buffer_params.offset,
+                             &buffer_params.stride,
+                             &buffer_params.pass_stride,
+                             &context.num_samples,
+                             &pass.noisy_offset,
+                             &pass.denoised_offset,
+                             &context.pass_sample_count,
+                             &pass.num_components,
+                             &pass.use_compositing);
+
+  return denoiser_queue_->enqueue(DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS, work_size, args);
+}
+
+/* Make sure the OptiX denoiser exists and is configured for the given context.
+ * Returns false (after logging an error) when either of the steps has failed. */
+bool OptiXDenoiser::denoise_ensure(DenoiseContext &context)
+{
+  /* Creation goes first: configuration is only attempted when the denoiser was created. */
+  const bool is_created = denoise_create_if_needed(context);
+  if (!is_created) {
+    LOG(ERROR) << "OptiX denoiser creation has failed.";
+    return false;
+  }
+
+  const bool is_configured = denoise_configure_if_needed(context);
+  if (!is_configured) {
+    LOG(ERROR) << "OptiX denoiser configuration has failed.";
+    return false;
+  }
+
+  return true;
+}
+
+/* Create the OptiX denoiser handle when there is none yet, or re-create it when the set of
+ * used guiding passes (albedo, normal, motion) differs from the one the existing handle was
+ * created for.
+ *
+ * Returns true when a handle which matches the context exists after the call. On failure an
+ * error is set on the denoiser device and false is returned.
+ *
+ * A newly created denoiser is marked as not configured. */
+bool OptiXDenoiser::denoise_create_if_needed(DenoiseContext &context)
+{
+  const bool recreate_denoiser = (optix_denoiser_ == nullptr) ||
+                                 (use_pass_albedo_ != context.use_pass_albedo) ||
+                                 (use_pass_normal_ != context.use_pass_normal) ||
+                                 (use_pass_motion_ != context.use_pass_motion);
+  if (!recreate_denoiser) {
+    return true;
+  }
+
+  /* Destroy existing handle before creating new one. */
+  if (optix_denoiser_) {
+    optixDenoiserDestroy(optix_denoiser_);
+    /* Forget the destroyed handle right away. If the creation below fails the member would
+     * otherwise keep the stale handle, which would then be destroyed for the second time by the
+     * destructor or by the next re-creation. */
+    optix_denoiser_ = nullptr;
+  }
+
+  /* Create OptiX denoiser handle on demand when it is first used. */
+  OptixDenoiserOptions denoiser_options = {};
+  denoiser_options.guideAlbedo = context.use_pass_albedo;
+  denoiser_options.guideNormal = context.use_pass_normal;
+
+  /* The temporal model is used when the motion pass is available, HDR model otherwise. */
+  OptixDenoiserModelKind model = OPTIX_DENOISER_MODEL_KIND_HDR;
+  if (context.use_pass_motion) {
+    model = OPTIX_DENOISER_MODEL_KIND_TEMPORAL;
+  }
+
+  const OptixResult result = optixDenoiserCreate(
+      static_cast<OptiXDevice *>(denoiser_device_)->context,
+      model,
+      &denoiser_options,
+      &optix_denoiser_);
+
+  if (result != OPTIX_SUCCESS) {
+    denoiser_device_->set_error("Failed to create OptiX denoiser");
+    return false;
+  }
+
+  /* OptiX denoiser handle was created with the requested number of input passes. */
+  use_pass_albedo_ = context.use_pass_albedo;
+  use_pass_normal_ = context.use_pass_normal;
+  use_pass_motion_ = context.use_pass_motion;
+
+  /* OptiX denoiser has been created, but it needs configuration. */
+  is_configured_ = false;
+
+  return true;
+}
+
+/* Configure the OptiX denoiser for the size of the buffer from the context: query the memory
+ * requirements, allocate the state and scratch memory, and set the denoiser up.
+ *
+ * Nothing is done when the denoiser is already configured for the same tile size.
+ *
+ * Returns true when the denoiser is configured after the call. When the setup fails an error is
+ * set on the denoiser device and false is returned.
+ * NOTE(review): `optix_device_assert` and `cuda_device_assert` are macros which are defined
+ * elsewhere; presumably they return false from this function on failure -- confirm. */
+bool OptiXDenoiser::denoise_configure_if_needed(DenoiseContext &context)
+{
+  /* Limit maximum tile size denoiser can be invoked with. */
+  const int2 tile_size = make_int2(min(context.buffer_params.width, 4096),
+                                   min(context.buffer_params.height, 4096));
+
+  if (is_configured_ && (configured_size_.x == tile_size.x && configured_size_.y == tile_size.y)) {
+    return true;
+  }
+
+  optix_device_assert(
+      denoiser_device_,
+      optixDenoiserComputeMemoryResources(optix_denoiser_, tile_size.x, tile_size.y, &sizes_));
+
+  /* Allocate denoiser state if tile size has changed since last setup.
+   * A single allocation holds both the state and the scratch memory: the state goes first and
+   * the scratch memory follows it. */
+  state_.device = denoiser_device_;
+  state_.alloc_to_device(sizes_.stateSizeInBytes + sizes_.withOverlapScratchSizeInBytes);
+
+  /* Initialize denoiser state for the current tile size.
+   * The size passed to the setup includes the overlap window on both sides of the tile. */
+  const OptixResult result = optixDenoiserSetup(
+      optix_denoiser_,
+      0, /* Work around bug in r495 drivers that causes artifacts when denoiser setup is called
+          * on a stream that is not the default stream. */
+      tile_size.x + sizes_.overlapWindowSizeInPixels * 2,
+      tile_size.y + sizes_.overlapWindowSizeInPixels * 2,
+      state_.device_pointer,
+      sizes_.stateSizeInBytes,
+      state_.device_pointer + sizes_.stateSizeInBytes,
+      sizes_.withOverlapScratchSizeInBytes);
+  if (result != OPTIX_SUCCESS) {
+    denoiser_device_->set_error("Failed to set up OptiX denoiser");
+    return false;
+  }
+
+  /* NOTE(review): presumably needed because the setup above runs on the default stream and not
+   * on the stream of the denoiser queue -- confirm. */
+  cuda_device_assert(denoiser_device_, cuCtxSynchronize());
+
+  is_configured_ = true;
+  configured_size_ = tile_size;
+
+  return true;
+}
+
+/* Run the OptiX denoiser on the given pass.
+ *
+ * The input is read from the denoised variant of the pass within the render buffers (which is
+ * where denoise_color_read() stores the pixels), and the result is written to the same memory.
+ * Guiding passes and the previous output are used when they are enabled in the context.
+ *
+ * The denoiser is invoked in tiles of the configured size, on the stream of the denoiser queue.
+ * NOTE(review): `optix_device_assert` is a macro which is defined elsewhere; presumably it
+ * returns false from this function when the invocation fails -- confirm. */
+bool OptiXDenoiser::denoise_run(const DenoiseContext &context, const DenoisePass &pass)
+{
+  const BufferParams &buffer_params = context.buffer_params;
+  const int width = buffer_params.width;
+  const int height = buffer_params.height;
+
+  /* Set up input and output layer information.
+   * Layers which are left zero-initialized have null data pointer. */
+  OptixImage2D color_layer = {0};
+  OptixImage2D albedo_layer = {0};
+  OptixImage2D normal_layer = {0};
+  OptixImage2D flow_layer = {0};
+
+  OptixImage2D output_layer = {0};
+  OptixImage2D prev_output_layer = {0};
+
+  /* Color pass. */
+  {
+    const int pass_denoised = pass.denoised_offset;
+    const int64_t pass_stride_in_bytes = context.buffer_params.pass_stride * sizeof(float);
+
+    color_layer.data = context.render_buffers->buffer.device_pointer +
+                       pass_denoised * sizeof(float);
+    color_layer.width = width;
+    color_layer.height = height;
+    color_layer.rowStrideInBytes = pass_stride_in_bytes * context.buffer_params.stride;
+    color_layer.pixelStrideInBytes = pass_stride_in_bytes;
+    color_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+  }
+
+  /* Previous output. */
+  if (context.prev_output.offset != PASS_UNUSED) {
+    const int64_t pass_stride_in_bytes = context.prev_output.pass_stride * sizeof(float);
+
+    prev_output_layer.data = context.prev_output.device_pointer +
+                             context.prev_output.offset * sizeof(float);
+    prev_output_layer.width = width;
+    prev_output_layer.height = height;
+    prev_output_layer.rowStrideInBytes = pass_stride_in_bytes * context.prev_output.stride;
+    prev_output_layer.pixelStrideInBytes = pass_stride_in_bytes;
+    prev_output_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+  }
+
+  /* Optional guiding passes: albedo, normal and flow (motion).
+   * All of them share the same storage and hence the same pixel and row strides. */
+  if (context.num_input_passes > 1) {
+    const device_ptr d_guiding_buffer = context.guiding_params.device_pointer;
+    const int64_t pixel_stride_in_bytes = context.guiding_params.pass_stride * sizeof(float);
+    const int64_t row_stride_in_bytes = context.guiding_params.stride * pixel_stride_in_bytes;
+
+    if (context.use_pass_albedo) {
+      albedo_layer.data = d_guiding_buffer + context.guiding_params.pass_albedo * sizeof(float);
+      albedo_layer.width = width;
+      albedo_layer.height = height;
+      albedo_layer.rowStrideInBytes = row_stride_in_bytes;
+      albedo_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      albedo_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+    }
+
+    if (context.use_pass_normal) {
+      normal_layer.data = d_guiding_buffer + context.guiding_params.pass_normal * sizeof(float);
+      normal_layer.width = width;
+      normal_layer.height = height;
+      normal_layer.rowStrideInBytes = row_stride_in_bytes;
+      normal_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      normal_layer.format = OPTIX_PIXEL_FORMAT_FLOAT3;
+    }
+
+    if (context.use_pass_motion) {
+      flow_layer.data = d_guiding_buffer + context.guiding_params.pass_flow * sizeof(float);
+      flow_layer.width = width;
+      flow_layer.height = height;
+      flow_layer.rowStrideInBytes = row_stride_in_bytes;
+      flow_layer.pixelStrideInBytes = pixel_stride_in_bytes;
+      flow_layer.format = OPTIX_PIXEL_FORMAT_FLOAT2;
+    }
+  }
+
+  /* Denoise in-place of the noisy input in the render buffers. */
+  output_layer = color_layer;
+
+  OptixDenoiserGuideLayer guide_layers = {};
+  guide_layers.albedo = albedo_layer;
+  guide_layers.normal = normal_layer;
+  guide_layers.flow = flow_layer;
+
+  OptixDenoiserLayer image_layers = {};
+  image_layers.input = color_layer;
+  image_layers.previousOutput = prev_output_layer;
+  image_layers.output = output_layer;
+
+  /* Finally run denoising.
+   * The state and scratch memory use the same layout as in denoise_configure_if_needed(): the
+   * state goes first and is followed by the scratch memory. */
+  OptixDenoiserParams params = {}; /* All parameters are disabled/zero. */
+
+  optix_device_assert(denoiser_device_,
+                      ccl::optixUtilDenoiserInvokeTiled(
+                          optix_denoiser_,
+                          static_cast<OptiXDeviceQueue *>(denoiser_queue_.get())->stream(),
+                          &params,
+                          state_.device_pointer,
+                          sizes_.stateSizeInBytes,
+                          &guide_layers,
+                          &image_layers,
+                          1,
+                          state_.device_pointer + sizes_.stateSizeInBytes,
+                          sizes_.withOverlapScratchSizeInBytes,
+                          sizes_.overlapWindowSizeInPixels,
+                          configured_size_.x,
+                          configured_size_.y));
+
+  return true;
+}
+
 CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/integrator/denoiser_optix.h
+++ b/intern/cycles/integrator/denoiser_optix.h
@@ -3,16 +3,84 @@

 #pragma once

-#include "integrator/denoiser_device.h"
+#ifdef WITH_OPTIX
+
+#  include "integrator/denoiser_gpu.h"
+
+#  include "device/optix/util.h"

 CCL_NAMESPACE_BEGIN

-class OptiXDenoiser : public DeviceDenoiser {
+/* Implementation of denoising API which uses the OptiX denoiser. */
+class OptiXDenoiser : public DenoiserGPU {
 public:
  OptiXDenoiser(Device *path_trace_device, const DenoiseParams &params);
+  ~OptiXDenoiser();

 protected:
  virtual uint get_device_type_mask() const override;
+
+ private:
+  class DenoiseContext;
+  class DenoisePass;
+
+  virtual bool denoise_buffer(const DenoiseTask &task) override;
+
+  /* Read guiding passes from the render buffers, preprocess them in a way which is expected by
+   * OptiX and store in the guiding passes memory within the given context.
+   *
+   * Pre-processing of the guiding passes is to only happen once per context lifetime. Do not
+   * preprocess them for every pass which is being denoised. */
+  bool denoise_filter_guiding_preprocess(const DenoiseContext &context);
+
+  /* Set fake albedo pixels in the albedo guiding pass storage.
+   * After this point only passes which do not need albedo for denoising can be processed. */
+  bool denoise_filter_guiding_set_fake_albedo(const DenoiseContext &context);
+
+  void denoise_pass(DenoiseContext &context, PassType pass_type);
+
+  /* Read input color pass from the render buffer into the memory which corresponds to the noisy
+   * input within the given context. Pixels are scaled to the number of samples, but are not
+   * preprocessed yet. */
+  void denoise_color_read(const DenoiseContext &context, const DenoisePass &pass);
+
+  /* Run corresponding filter kernels, preparing data for the denoiser or copying data from the
+   * denoiser result to the render buffer. */
+  bool denoise_filter_color_preprocess(const DenoiseContext &context, const DenoisePass &pass);
+  bool denoise_filter_color_postprocess(const DenoiseContext &context, const DenoisePass &pass);
+
+  /* Make sure the OptiX denoiser is created and configured. */
+  bool denoise_ensure(DenoiseContext &context);
+
+  /* Create OptiX denoiser descriptor if needed.
+   * Will do nothing if the current OptiX descriptor is usable for the given parameters.
+   * If the OptiX denoiser descriptor is re-allocated here, it is left unconfigured. */
+  bool denoise_create_if_needed(DenoiseContext &context);
+
+  /* Configure existing OptiX denoiser descriptor for use with the given task. */
+  bool denoise_configure_if_needed(DenoiseContext &context);
+
+  /* Run configured denoiser. */
+  bool denoise_run(const DenoiseContext &context, const DenoisePass &pass);
+
+  OptixDenoiser optix_denoiser_ = nullptr;
+
+  /* Configuration size, as provided to `optixDenoiserSetup()`.
+   * If `optixDenoiserSetup()` was never called on the current `optix_denoiser_`, then
+   * `is_configured_` is false. */
+  bool is_configured_ = false;
+  int2 configured_size_ = make_int2(0, 0);
+
+  /* OptiX denoiser state and scratch buffers, stored in a single memory buffer.
+   * The memory layout is as follows: [denoiser state][scratch buffer]. */
+  device_only_memory<unsigned char> state_;
+  OptixDenoiserSizes sizes_ = {};
+
+  bool use_pass_albedo_ = false;
+  bool use_pass_normal_ = false;
+  bool use_pass_motion_ = false;
 };

 CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -37,6 +37,14 @@ set(SRC_KERNEL_DEVICE_OPTIX
  device/optix/kernel_shader_raytrace.cu
 )

+if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+  set(SRC_KERNEL_DEVICE_OPTIX
+    ${SRC_KERNEL_DEVICE_OPTIX}
+    osl/services_optix.cu
+    device/optix/kernel_osl.cu
+  )
+endif()
+
 set(SRC_KERNEL_DEVICE_ONEAPI
  device/oneapi/kernel.cpp
 )
@@ -181,6 +189,16 @@ set(SRC_KERNEL_SVM_HEADERS
  svm/vertex_color.h
 )

+if(WITH_CYCLES_OSL)
+  set(SRC_KERNEL_OSL_HEADERS
+    osl/osl.h
+    osl/closures_setup.h
+    osl/closures_template.h
+    osl/services_gpu.h
+    osl/types.h
+  )
+endif()
+
 set(SRC_KERNEL_GEOM_HEADERS
  geom/geom.h
  geom/attribute.h
@@ -306,6 +324,7 @@ set(SRC_KERNEL_HEADERS
  ${SRC_KERNEL_GEOM_HEADERS}
  ${SRC_KERNEL_INTEGRATOR_HEADERS}
  ${SRC_KERNEL_LIGHT_HEADERS}
+  ${SRC_KERNEL_OSL_HEADERS}
  ${SRC_KERNEL_SAMPLE_HEADERS}
  ${SRC_KERNEL_SVM_HEADERS}
  ${SRC_KERNEL_TYPES_HEADERS}
@@ -328,6 +347,7 @@ set(SRC_UTIL_HEADERS
  ../util/math_int2.h
  ../util/math_int3.h
  ../util/math_int4.h
+  ../util/math_int8.h
  ../util/math_matrix.h
  ../util/projection.h
  ../util/rect.h
@@ -350,6 +370,8 @@ set(SRC_UTIL_HEADERS
  ../util/types_int3_impl.h
  ../util/types_int4.h
  ../util/types_int4_impl.h
+  ../util/types_int8.h
+  ../util/types_int8_impl.h
  ../util/types_spectrum.h
  ../util/types_uchar2.h
  ../util/types_uchar2_impl.h
@@ -660,6 +682,16 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
    kernel_optix_shader_raytrace
    "device/optix/kernel_shader_raytrace.cu"
    "--keep-device-functions")
+  if(WITH_CYCLES_OSL AND (OSL_LIBRARY_VERSION_MINOR GREATER_EQUAL 13 OR OSL_LIBRARY_VERSION_MAJOR GREATER 1))
+    CYCLES_OPTIX_KERNEL_ADD(
+      kernel_optix_osl
+      "device/optix/kernel_osl.cu"
+      "--relocatable-device-code=true")
+    CYCLES_OPTIX_KERNEL_ADD(
+      kernel_optix_osl_services
+      "osl/services_optix.cu"
+      "--relocatable-device-code=true")
+  endif()

  add_custom_target(cycles_kernel_optix ALL DEPENDS ${optix_ptx})
  cycles_set_solution_folder(cycles_kernel_optix)
@@ -947,6 +979,7 @@ source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_KERNEL_TYPES_HEADERS})
 source_group("light" FILES ${SRC_KERNEL_LIGHT_HEADERS})
+source_group("osl" FILES ${SRC_KERNEL_OSL_HEADERS})
 source_group("sample" FILES ${SRC_KERNEL_SAMPLE_HEADERS})
 source_group("svm" FILES ${SRC_KERNEL_SVM_HEADERS})
 source_group("util" FILES ${SRC_KERNEL_UTIL_HEADERS})
@@ -983,6 +1016,7 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLE
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_LIGHT_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/light)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_OSL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/osl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SAMPLE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/sample)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_TYPES_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -297,8 +297,10 @@ ccl_device_inline void bsdf_roughness_eta(const KernelGlobals kg,
                                          ccl_private float2 *roughness,
                                          ccl_private float *eta)
 {
+#ifdef __SVM__
  bool refractive = false;
  float alpha = 1.0f;
+#endif
  switch (sc->type) {
    case CLOSURE_BSDF_DIFFUSE_ID:
      *roughness = one_float2();
@@ -578,11 +580,11 @@ ccl_device_inline
    case CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID:
    case CLOSURE_BSDF_MICROFACET_GGX_CLEARCOAT_ID:
    case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-      eval = bsdf_microfacet_ggx_eval(sc, sd->N, sd->I, omega_in, pdf);
+      eval = bsdf_microfacet_ggx_eval(sc, sd->I, omega_in, pdf);
      break;
    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID:
    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_FRESNEL_ID:
-      eval = bsdf_microfacet_multi_ggx_eval(sc, sd->N, sd->I, omega_in, pdf, &sd->lcg_state);
+      eval = bsdf_microfacet_multi_ggx_eval(sc, sd->I, omega_in, pdf, &sd->lcg_state);
      break;
    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID:
    case CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID:
@@ -590,10 +592,10 @@ ccl_device_inline
      break;
    case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
    case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-      eval = bsdf_microfacet_beckmann_eval(sc, sd->N, sd->I, omega_in, pdf);
+      eval = bsdf_microfacet_beckmann_eval(sc, sd->I, omega_in, pdf);
      break;
    case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
-      eval = bsdf_ashikhmin_shirley_eval(sc, sd->N, sd->I, omega_in, pdf);
+      eval = bsdf_ashikhmin_shirley_eval(sc, sd->I, omega_in, pdf);
      break;
    case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
      eval = bsdf_ashikhmin_velvet_eval(sc, sd->I, omega_in, pdf);
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -40,13 +40,11 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
 }

 ccl_device_forceinline Spectrum bsdf_ashikhmin_shirley_eval(ccl_private const ShaderClosure *sc,
-                                                            const float3 Ng,
                                                            const float3 I,
                                                            const float3 omega_in,
                                                            ccl_private float *pdf)
 {
  ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
-  const float cosNgI = dot(Ng, omega_in);
  float3 N = bsdf->N;

  float NdotI = dot(N, I);        /* in Cycles/OSL convention I is omega_out */
@@ -54,8 +52,7 @@ ccl_device_forceinline Spectrum bsdf_ashikhmin_shirley_eval(ccl_private const Sh

  float out = 0.0f;

-  if ((cosNgI < 0.0f) || fmaxf(bsdf->alpha_x, bsdf->alpha_y) <= 1e-4f ||
-      !(NdotI > 0.0f && NdotO > 0.0f)) {
+  if (fmaxf(bsdf->alpha_x, bsdf->alpha_y) <= 1e-4f || !(NdotI > 0.0f && NdotO > 0.0f)) {
    *pdf = 0.0f;
    return zero_spectrum();
  }
@@ -213,7 +210,7 @@ ccl_device int bsdf_ashikhmin_shirley_sample(ccl_private const ShaderClosure *sc
  }
  else {
    /* leave the rest to eval */
-    *eval = bsdf_ashikhmin_shirley_eval(sc, N, I, *omega_in, pdf);
+    *eval = bsdf_ashikhmin_shirley_eval(sc, I, *omega_in, pdf);
  }

  return label;
--- a/intern/cycles/kernel/closure/bsdf_microfacet.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
@@ -517,30 +517,27 @@ ccl_device Spectrum bsdf_microfacet_ggx_eval_transmit(ccl_private const Microfac
 }

 ccl_device Spectrum bsdf_microfacet_ggx_eval(ccl_private const ShaderClosure *sc,
-                                             const float3 Ng,
                                             const float3 I,
                                             const float3 omega_in,
                                             ccl_private float *pdf)
 {
  ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
-  const bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
  const float alpha_x = bsdf->alpha_x;
  const float alpha_y = bsdf->alpha_y;
-  const float cosNgI = dot(Ng, omega_in);
-
-  if (((cosNgI < 0.0f) != m_refractive) || alpha_x * alpha_y <= 1e-7f) {
-    *pdf = 0.0f;
-    return zero_spectrum();
-  }
-
+  const bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
  const float3 N = bsdf->N;
  const float cosNO = dot(N, I);
  const float cosNI = dot(N, omega_in);

-  return (cosNgI < 0.0f) ? bsdf_microfacet_ggx_eval_transmit(
-                               bsdf, N, I, omega_in, pdf, alpha_x, alpha_y, cosNO, cosNI) :
-                           bsdf_microfacet_ggx_eval_reflect(
-                               bsdf, N, I, omega_in, pdf, alpha_x, alpha_y, cosNO, cosNI);
+  if (((cosNI < 0.0f) != m_refractive) || alpha_x * alpha_y <= 1e-7f) {
+    *pdf = 0.0f;
+    return zero_spectrum();
+  }
+
+  return (cosNI < 0.0f) ? bsdf_microfacet_ggx_eval_transmit(
+                              bsdf, N, I, omega_in, pdf, alpha_x, alpha_y, cosNO, cosNI) :
+                          bsdf_microfacet_ggx_eval_reflect(
+                              bsdf, N, I, omega_in, pdf, alpha_x, alpha_y, cosNO, cosNI);
 }

 ccl_device int bsdf_microfacet_ggx_sample(KernelGlobals kg,
@@ -945,26 +942,23 @@ ccl_device Spectrum bsdf_microfacet_beckmann_eval_transmit(ccl_private const Mic
 }

 ccl_device Spectrum bsdf_microfacet_beckmann_eval(ccl_private const ShaderClosure *sc,
-                                                  const float3 Ng,
                                                  const float3 I,
                                                  const float3 omega_in,
                                                  ccl_private float *pdf)
 {
  ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
-  const bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
  const float alpha_x = bsdf->alpha_x;
  const float alpha_y = bsdf->alpha_y;
-  const float cosNgI = dot(Ng, omega_in);
-
-  if (((cosNgI < 0.0f) != m_refractive) || alpha_x * alpha_y <= 1e-7f) {
-    *pdf = 0.0f;
-    return zero_spectrum();
-  }
-
+  const bool m_refractive = bsdf->type == CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
  const float3 N = bsdf->N;
  const float cosNO = dot(N, I);
  const float cosNI = dot(N, omega_in);

+  if (((cosNI < 0.0f) != m_refractive) || alpha_x * alpha_y <= 1e-7f) {
+    *pdf = 0.0f;
+    return zero_spectrum();
+  }
+
  return (cosNI < 0.0f) ? bsdf_microfacet_beckmann_eval_transmit(
                              bsdf, N, I, omega_in, pdf, alpha_x, alpha_y, cosNO, cosNI) :
                          bsdf_microfacet_beckmann_eval_reflect(
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi.h
@@ -416,16 +416,14 @@ ccl_device int bsdf_microfacet_multi_ggx_refraction_setup(ccl_private Microfacet
 }

 ccl_device Spectrum bsdf_microfacet_multi_ggx_eval(ccl_private const ShaderClosure *sc,
-                                                   const float3 Ng,
                                                   const float3 I,
                                                   const float3 omega_in,
                                                   ccl_private float *pdf,
                                                   ccl_private uint *lcg_state)
 {
  ccl_private const MicrofacetBsdf *bsdf = (ccl_private const MicrofacetBsdf *)sc;
-  const float cosNgI = dot(Ng, omega_in);

-  if ((cosNgI < 0.0f) || bsdf->alpha_x * bsdf->alpha_y < 1e-7f) {
+  if (bsdf->alpha_x * bsdf->alpha_y < 1e-7f) {
    *pdf = 0.0f;
    return zero_spectrum();
  }
--- a/intern/cycles/kernel/device/cpu/kernel.cpp
+++ b/intern/cycles/kernel/device/cpu/kernel.cpp
@@ -7,6 +7,7 @@
 * one with SSE2 intrinsics.
 */
 #if defined(__x86_64__) || defined(_M_X64)
+#  define __KERNEL_SSE__
 #  define __KERNEL_SSE2__
 #endif

@@ -29,11 +30,15 @@
 #    define __KERNEL_SSE41__
 #  endif
 #  ifdef __AVX__
-#    define __KERNEL_SSE__
+#    ifndef __KERNEL_SSE__
+#      define __KERNEL_SSE__
+#    endif
 #    define __KERNEL_AVX__
 #  endif
 #  ifdef __AVX2__
-#    define __KERNEL_SSE__
+#    ifndef __KERNEL_SSE__
+#      define __KERNEL_SSE__
+#    endif
 #    define __KERNEL_AVX2__
 #  endif
 #endif
--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -30,6 +30,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */

 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #if __CUDA_ARCH__ < 500
 #  define ccl_device_inline __device__ __forceinline__
 #  define ccl_device_forceinline __device__ __forceinline__
@@ -109,14 +110,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D

 typedef unsigned short half;

-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
  half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
  return val;
 }

-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -28,6 +28,7 @@ typedef unsigned long long uint64_t;
 /* Qualifiers */

 #define ccl_device __device__ __inline__
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline __device__ __inline__
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -38,6 +38,7 @@ using namespace metal::raytracing;
 #  define ccl_device_noinline ccl_device __attribute__((noinline))
 #endif

+#define ccl_device_extern extern "C"
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_device_inline_method ccl_device
 #define ccl_global device
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -28,6 +28,7 @@
 /* Qualifier wrappers for different names on different devices */

 #define ccl_device
+#define ccl_device_extern extern "C"
 #define ccl_global
 #define ccl_always_inline __attribute__((always_inline))
 #define ccl_device_inline inline
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -33,14 +33,16 @@ typedef unsigned long long uint64_t;
 #endif

 #define ccl_device \
-  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+  static __device__ \
+      __forceinline__  // Function calls are bad for OptiX performance, so inline everything
+#define ccl_device_extern extern "C" __device__
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
-#define ccl_device_inline_method ccl_device
-#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_inline_method __device__ __forceinline__
+#define ccl_device_noinline static __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_inline_constant __constant__
+#define ccl_inline_constant static __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -57,23 +59,6 @@ typedef unsigned long long uint64_t;

 #define kernel_assert(cond)

-/* GPU thread, block, grid size and index */
-
-#define ccl_gpu_thread_idx_x (threadIdx.x)
-#define ccl_gpu_block_dim_x (blockDim.x)
-#define ccl_gpu_block_idx_x (blockIdx.x)
-#define ccl_gpu_grid_dim_x (gridDim.x)
-#define ccl_gpu_warp_size (warpSize)
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-
-#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
-#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
-
-/* GPU warp synchronization. */
-
-#define ccl_gpu_syncthreads() __syncthreads()
-#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
-
 /* GPU texture objects */

 typedef unsigned long long CUtexObject;
@@ -101,14 +86,14 @@ ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object_3D

 typedef unsigned short half;

-__device__ half __float2half(const float f)
+ccl_device_forceinline half __float2half(const float f)
 {
  half val;
  asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
  return val;
 }

-__device__ float __half2float(const half h)
+ccl_device_forceinline float __half2float(const half h)
 {
  float val;
  asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(h));
--- a/intern/cycles/kernel/device/optix/globals.h
+++ b/intern/cycles/kernel/device/optix/globals.h
@@ -25,6 +25,7 @@ struct KernelParamsOptiX {
  /* Kernel arguments */
  const int *path_index_array;
  float *render_buffer;
+  int offset;

  /* Global scene data and textures */
  KernelData data;
@@ -36,7 +37,11 @@ struct KernelParamsOptiX {
 };

 #ifdef __NVCC__
-extern "C" static __constant__ KernelParamsOptiX kernel_params;
+extern "C"
+#  ifndef __CUDACC_RDC__
+    static
+#  endif
+    __constant__ KernelParamsOptiX kernel_params;
 #endif

 /* Abstraction macros */
--- a/intern/cycles/kernel/device/optix/kernel_osl.cu
+++ b/intern/cycles/kernel/device/optix/kernel_osl.cu
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+/* Copy of the regular OptiX kernels with additional OSL support. */
+
+#include "kernel/device/optix/kernel_shader_raytrace.cu"
+
+#include "kernel/bake/bake.h"
+#include "kernel/integrator/shade_background.h"
+#include "kernel/integrator/shade_light.h"
+#include "kernel/integrator/shade_shadow.h"
+#include "kernel/integrator/shade_volume.h"
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_background()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_background(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_light()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_light(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_surface()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_surface(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_volume()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_volume(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_integrator_shade_shadow()
+{
+  const int global_index = optixGetLaunchIndex().x;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
+  integrator_shade_shadow(nullptr, path_index, kernel_params.render_buffer);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_displace()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_displace_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_background()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_background_evaluate(nullptr, input, output, global_index);
+}
+
+extern "C" __global__ void __raygen__kernel_optix_shader_eval_curve_shadow_transparency()
+{
+  KernelShaderEvalInput *const input = (KernelShaderEvalInput *)kernel_params.path_index_array;
+  float *const output = kernel_params.render_buffer;
+  const int global_index = kernel_params.offset + optixGetLaunchIndex().x;
+  kernel_curve_shadow_transparency_evaluate(nullptr, input, output, global_index);
+}
--- a/intern/cycles/kernel/film/adaptive_sampling.h
+++ b/intern/cycles/kernel/film/adaptive_sampling.h
@@ -58,13 +58,29 @@ ccl_device bool film_adaptive_sampling_convergence_check(KernelGlobals kg,
  const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);

  const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
-  const float inv_sample = 1.0f / sample;
+  const float intensity_scale = kernel_data.film.exposure / sample;

  /* The per pixel error as seen in section 2.1 of
   * "A hierarchical automatic stopping condition for Monte Carlo global illumination" */
  const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
-                                 inv_sample;
-  const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
+                                 intensity_scale;
+  const float intensity = (I.x + I.y + I.z) * intensity_scale;
+
+  /* Anything with R+G+B > 1 is highly exposed - even in sRGB it's a range that
+   * some displays aren't even able to display without significant losses in
+   * detail. Everything with R+G+B > 3 is overexposed and should receive
+   * even fewer samples. Filmic-like curves need maximum sampling rate at
+   * intensity near 0.1-0.2, so a threshold of 1 for R+G+B leaves an additional
+   * f-stop in case it is needed for compositing.
+   */
+  float error_normalize;
+  if (intensity < 1.0f) {
+    error_normalize = sqrtf(intensity);
+  }
+  else {
+    error_normalize = intensity;
+  }
+
  /* A small epsilon is added to the divisor to prevent division by zero. */
  const float error = error_difference / (0.0001f + error_normalize);
  const bool did_converge = (error < threshold);
--- a/intern/cycles/kernel/film/data_passes.h
+++ b/intern/cycles/kernel/film/data_passes.h
@@ -157,47 +157,4 @@ ccl_device_inline void film_write_data_passes(KernelGlobals kg,
 #endif
 }

-ccl_device_inline void film_write_data_passes_background(
-    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
-{
-#ifdef __PASSES__
-  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-
-  if (!(path_flag & PATH_RAY_TRANSPARENT_BACKGROUND)) {
-    return;
-  }
-
-  /* Don't write data passes for paths that were split off for shadow catchers
-   * to avoid double-counting. */
-  if (path_flag & PATH_RAY_SHADOW_CATCHER_PASS) {
-    return;
-  }
-
-  const int flag = kernel_data.film.pass_flag;
-
-  if (!(flag & PASS_ANY)) {
-    return;
-  }
-
-  if (!(path_flag & PATH_RAY_SINGLE_PASS_DONE)) {
-    ccl_global float *buffer = film_pass_pixel_render_buffer(kg, state, render_buffer);
-
-    if (INTEGRATOR_STATE(state, path, sample) == 0) {
-      if (flag & PASSMASK(DEPTH)) {
-        film_overwrite_pass_float(buffer + kernel_data.film.pass_depth, 0.0f);
-      }
-      if (flag & PASSMASK(OBJECT_ID)) {
-        film_overwrite_pass_float(buffer + kernel_data.film.pass_object_id, 0.0f);
-      }
-      if (flag & PASSMASK(MATERIAL_ID)) {
-        film_overwrite_pass_float(buffer + kernel_data.film.pass_material_id, 0.0f);
-      }
-      if (flag & PASSMASK(POSITION)) {
-        film_overwrite_pass_float3(buffer + kernel_data.film.pass_position, zero_float3());
-      }
-    }
-  }
-#endif
-}
-
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/displacement_shader.h
+++ b/intern/cycles/kernel/integrator/displacement_shader.h
@@ -24,8 +24,8 @@ ccl_device void displacement_shader_eval(KernelGlobals kg,

  /* this will modify sd->P */
 #ifdef __OSL__
-  if (kg->osl) {
-    OSLShader::eval_displacement(kg, state, sd);
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(kg, state, sd, 0);
  }
  else
 #endif
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -3,7 +3,6 @@

 #pragma once

-#include "kernel/film/data_passes.h"
 #include "kernel/film/light_passes.h"

 #include "kernel/integrator/guiding.h"
@@ -132,7 +131,6 @@ ccl_device_inline void integrate_background(KernelGlobals kg,

  /* Write to render buffer. */
  film_write_background(kg, state, L, transparent, is_transparent_background_ray, render_buffer);
-  film_write_data_passes_background(kg, state, render_buffer);
 }

 ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
--- a/intern/cycles/kernel/integrator/surface_shader.h
+++ b/intern/cycles/kernel/integrator/surface_shader.h
@@ -827,13 +827,8 @@ ccl_device void surface_shader_eval(KernelGlobals kg,
  sd->num_closure_left = max_closures;

 #ifdef __OSL__
-  if (kg->osl) {
-    if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
-      OSLShader::eval_background(kg, state, sd, path_flag);
-    }
-    else {
-      OSLShader::eval_surface(kg, state, sd, path_flag);
-    }
+  if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+    osl_eval_nodes<SHADER_TYPE_SURFACE>(kg, state, sd, path_flag);
  }
  else
 #endif
--- a/intern/cycles/kernel/integrator/volume_shader.h
+++ b/intern/cycles/kernel/integrator/volume_shader.h
@@ -491,8 +491,8 @@ ccl_device_inline void volume_shader_eval(KernelGlobals kg,

    /* evaluate shader */
 #  ifdef __OSL__
-    if (kg->osl) {
-      OSLShader::eval_volume(kg, state, sd, path_flag);
+    if (kernel_data.kernel_features & KERNEL_FEATURE_OSL) {
+      osl_eval_nodes<SHADER_TYPE_VOLUME>(kg, state, sd, path_flag);
    }
    else
 #  endif
--- a/intern/cycles/kernel/osl/closures.cpp
+++ b/intern/cycles/kernel/osl/closures.cpp
@@ -25,13 +25,18 @@

 #include "kernel/osl/osl.h"

-#include "kernel/osl/closures_setup.h"
-
 #define TO_VEC3(v) OSL::Vec3(v.x, v.y, v.z)
 #define TO_FLOAT3(v) make_float3(v[0], v[1], v[2])

 CCL_NAMESPACE_BEGIN

+static_assert(sizeof(OSLClosure) == sizeof(OSL::ClosureColor) &&
+              sizeof(OSLClosureAdd) == sizeof(OSL::ClosureAdd) &&
+              sizeof(OSLClosureMul) == sizeof(OSL::ClosureMul) &&
+              sizeof(OSLClosureComponent) == sizeof(OSL::ClosureComponent));
+static_assert(sizeof(ShaderGlobals) == sizeof(OSL::ShaderGlobals) &&
+              offsetof(ShaderGlobals, Ci) == offsetof(OSL::ShaderGlobals, Ci));
+
 /* Registration */

 #define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
@@ -60,53 +65,18 @@ void OSLRenderServices::register_closures(OSL::ShadingSystem *ss)
 #include "closures_template.h"
 }

-/* Globals */
+/* Surface & Background */

-static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
-                                        ShaderData *sd,
-                                        const void *state,
-                                        uint32_t path_flag,
-                                        OSLThreadData *tdata)
+template<>
+void osl_eval_nodes<SHADER_TYPE_SURFACE>(const KernelGlobalsCPU *kg,
+                                         const void *state,
+                                         ShaderData *sd,
+                                         uint32_t path_flag)
 {
-  OSL::ShaderGlobals *globals = &tdata->globals;
-
-  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
-  const differential3 dI = differential_from_compact(sd->I, sd->dI);
-
-  /* copy from shader data to shader globals */
-  globals->P = TO_VEC3(sd->P);
-  globals->dPdx = TO_VEC3(dP.dx);
-  globals->dPdy = TO_VEC3(dP.dy);
-  globals->I = TO_VEC3(sd->I);
-  globals->dIdx = TO_VEC3(dI.dx);
-  globals->dIdy = TO_VEC3(dI.dy);
-  globals->N = TO_VEC3(sd->N);
-  globals->Ng = TO_VEC3(sd->Ng);
-  globals->u = sd->u;
-  globals->dudx = sd->du.dx;
-  globals->dudy = sd->du.dy;
-  globals->v = sd->v;
-  globals->dvdx = sd->dv.dx;
-  globals->dvdy = sd->dv.dy;
-  globals->dPdu = TO_VEC3(sd->dPdu);
-  globals->dPdv = TO_VEC3(sd->dPdv);
-  globals->surfacearea = 1.0f;
-  globals->time = sd->time;
-
-  /* booleans */
-  globals->raytype = path_flag;
-  globals->flipHandedness = 0;
-  globals->backfacing = (sd->flag & SD_BACKFACING);
-
-  /* shader data to be used in services callbacks */
-  globals->renderstate = sd;
-
-  /* hacky, we leave it to services to fetch actual object matrix */
-  globals->shader2common = sd;
-  globals->object2common = sd;
-
-  /* must be set to NULL before execute */
-  globals->Ci = NULL;
+  /* setup shader globals from shader data */
+  OSLThreadData *tdata = kg->osl_tdata;
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));

  /* clear trace data */
  tdata->tracedata.init = false;
@@ -121,53 +91,6 @@ static void shaderdata_to_shaderglobals(const KernelGlobalsCPU *kg,
    sd->osl_path_state = (const IntegratorStateCPU *)state;
    sd->osl_shadow_path_state = nullptr;
  }
-}
-
-static void flatten_closure_tree(const KernelGlobalsCPU *kg,
-                                 ShaderData *sd,
-                                 uint32_t path_flag,
-                                 const OSL::ClosureColor *closure,
-                                 float3 weight = make_float3(1.0f, 1.0f, 1.0f))
-{
-  /* OSL gives us a closure tree, we flatten it into arrays per
-   * closure type, for evaluation, sampling, etc later on. */
-
-  switch (closure->id) {
-    case OSL::ClosureColor::MUL: {
-      OSL::ClosureMul *mul = (OSL::ClosureMul *)closure;
-      flatten_closure_tree(kg, sd, path_flag, mul->closure, TO_FLOAT3(mul->weight) * weight);
-      break;
-    }
-    case OSL::ClosureColor::ADD: {
-      OSL::ClosureAdd *add = (OSL::ClosureAdd *)closure;
-      flatten_closure_tree(kg, sd, path_flag, add->closureA, weight);
-      flatten_closure_tree(kg, sd, path_flag, add->closureB, weight);
-      break;
-    }
-#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
-  case OSL_CLOSURE_##Upper##_ID: { \
-    const OSL::ClosureComponent *comp = reinterpret_cast<const OSL::ClosureComponent *>(closure); \
-    weight *= TO_FLOAT3(comp->w); \
-    osl_closure_##lower##_setup( \
-        kg, sd, path_flag, weight, reinterpret_cast<const Upper##Closure *>(comp + 1)); \
-    break; \
-  }
-#include "closures_template.h"
-    default:
-      break;
-  }
-}
-
-/* Surface */
-
-void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
-                             const void *state,
-                             ShaderData *sd,
-                             uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);

  /* execute shader for this point */
  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -175,101 +98,99 @@ void OSLShader::eval_surface(const KernelGlobalsCPU *kg,
  OSL::ShadingContext *octx = tdata->context;
  int shader = sd->shader & SHADER_MASK;

-  /* automatic bump shader */
-  if (kg->osl->bump_state[shader]) {
-    /* save state */
-    const float3 P = sd->P;
-    const float dP = sd->dP;
-    const OSL::Vec3 dPdx = globals->dPdx;
-    const OSL::Vec3 dPdy = globals->dPdy;
+  if (sd->object == OBJECT_NONE && sd->lamp == LAMP_NONE) {
+    /* background */
+    if (kg->osl->background_state) {
+      ss->execute(octx, *(kg->osl->background_state), *globals);
+    }
+  }
+  else {
+    /* automatic bump shader */
+    if (kg->osl->bump_state[shader]) {
+      /* save state */
+      const float3 P = sd->P;
+      const float dP = sd->dP;
+      const OSL::Vec3 dPdx = globals->dPdx;
+      const OSL::Vec3 dPdy = globals->dPdy;

-    /* set state as if undisplaced */
-    if (sd->flag & SD_HAS_DISPLACEMENT) {
-      float data[9];
-      bool found = kg->osl->services->get_attribute(sd,
-                                                    true,
-                                                    OSLRenderServices::u_empty,
-                                                    TypeDesc::TypeVector,
-                                                    OSLRenderServices::u_geom_undisplaced,
-                                                    data);
-      (void)found;
-      assert(found);
+      /* set state as if undisplaced */
+      if (sd->flag & SD_HAS_DISPLACEMENT) {
+        float data[9];
+        bool found = kg->osl->services->get_attribute(sd,
+                                                      true,
+                                                      OSLRenderServices::u_empty,
+                                                      TypeDesc::TypeVector,
+                                                      OSLRenderServices::u_geom_undisplaced,
+                                                      data);
+        (void)found;
+        assert(found);

-      differential3 tmp_dP;
-      memcpy(&sd->P, data, sizeof(float) * 3);
-      memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
-      memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);
+        differential3 tmp_dP;
+        memcpy(&sd->P, data, sizeof(float) * 3);
+        memcpy(&tmp_dP.dx, data + 3, sizeof(float) * 3);
+        memcpy(&tmp_dP.dy, data + 6, sizeof(float) * 3);

-      object_position_transform(kg, sd, &sd->P);
-      object_dir_transform(kg, sd, &tmp_dP.dx);
-      object_dir_transform(kg, sd, &tmp_dP.dy);
+        object_position_transform(kg, sd, &sd->P);
+        object_dir_transform(kg, sd, &tmp_dP.dx);
+        object_dir_transform(kg, sd, &tmp_dP.dy);

-      sd->dP = differential_make_compact(tmp_dP);
+        sd->dP = differential_make_compact(tmp_dP);

-      globals->P = TO_VEC3(sd->P);
-      globals->dPdx = TO_VEC3(tmp_dP.dx);
-      globals->dPdy = TO_VEC3(tmp_dP.dy);
+        globals->P = TO_VEC3(sd->P);
+        globals->dPdx = TO_VEC3(tmp_dP.dx);
+        globals->dPdy = TO_VEC3(tmp_dP.dy);
+      }
+
+      /* execute bump shader */
+      ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
+
+      /* reset state */
+      sd->P = P;
+      sd->dP = dP;
+
+      globals->P = TO_VEC3(P);
+      globals->dPdx = TO_VEC3(dPdx);
+      globals->dPdy = TO_VEC3(dPdy);
    }

-    /* execute bump shader */
-    ss->execute(octx, *(kg->osl->bump_state[shader]), *globals);
-
-    /* reset state */
-    sd->P = P;
-    sd->dP = dP;
-
-    globals->P = TO_VEC3(P);
-    globals->dPdx = TO_VEC3(dPdx);
-    globals->dPdy = TO_VEC3(dPdy);
-  }
-
-  /* surface shader */
-  if (kg->osl->surface_state[shader]) {
-    ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+    /* surface shader */
+    if (kg->osl->surface_state[shader]) {
+      ss->execute(octx, *(kg->osl->surface_state[shader]), *globals);
+    }
  }

  /* flatten closure tree */
  if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
-  }
-}
-
-/* Background */
-
-void OSLShader::eval_background(const KernelGlobalsCPU *kg,
-                                const void *state,
-                                ShaderData *sd,
-                                uint32_t path_flag)
-{
-  /* setup shader globals from shader data */
-  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
-
-  /* execute shader for this point */
-  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
-  OSL::ShaderGlobals *globals = &tdata->globals;
-  OSL::ShadingContext *octx = tdata->context;
-
-  if (kg->osl->background_state) {
-    ss->execute(octx, *(kg->osl->background_state), *globals);
-  }
-
-  /* return background color immediately */
-  if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+    flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
  }
 }

 /* Volume */

-void OSLShader::eval_volume(const KernelGlobalsCPU *kg,
-                            const void *state,
-                            ShaderData *sd,
-                            uint32_t path_flag)
+template<>
+void osl_eval_nodes<SHADER_TYPE_VOLUME>(const KernelGlobalsCPU *kg,
+                                        const void *state,
+                                        ShaderData *sd,
+                                        uint32_t path_flag)
 {
  /* setup shader globals from shader data */
  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, path_flag, tdata);
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+
+  /* clear trace data */
+  tdata->tracedata.init = false;
+
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  if (path_flag & PATH_RAY_SHADOW) {
+    sd->osl_path_state = nullptr;
+    sd->osl_shadow_path_state = (const IntegratorShadowStateCPU *)state;
+  }
+  else {
+    sd->osl_path_state = (const IntegratorStateCPU *)state;
+    sd->osl_shadow_path_state = nullptr;
+  }

  /* execute shader */
  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
@@ -283,17 +204,30 @@ void OSLShader::eval_volume(const KernelGlobalsCPU *kg,

  /* flatten closure tree */
  if (globals->Ci) {
-    flatten_closure_tree(kg, sd, path_flag, globals->Ci);
+    flatten_closure_tree(kg, sd, path_flag, reinterpret_cast<OSLClosure *>(globals->Ci));
  }
 }

 /* Displacement */

-void OSLShader::eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd)
+template<>
+void osl_eval_nodes<SHADER_TYPE_DISPLACEMENT>(const KernelGlobalsCPU *kg,
+                                              const void *state,
+                                              ShaderData *sd,
+                                              uint32_t path_flag)
 {
  /* setup shader globals from shader data */
  OSLThreadData *tdata = kg->osl_tdata;
-  shaderdata_to_shaderglobals(kg, sd, state, 0, tdata);
+  shaderdata_to_shaderglobals(
+      kg, sd, path_flag, reinterpret_cast<ShaderGlobals *>(&tdata->globals));
+
+  /* clear trace data */
+  tdata->tracedata.init = false;
+
+  /* Used by render-services. */
+  sd->osl_globals = kg;
+  sd->osl_path_state = (const IntegratorStateCPU *)state;
+  sd->osl_shadow_path_state = nullptr;

  /* execute shader */
  OSL::ShadingSystem *ss = (OSL::ShadingSystem *)kg->osl_ss;
--- a/intern/cycles/kernel/osl/closures_setup.h
+++ b/intern/cycles/kernel/osl/closures_setup.h
@@ -40,12 +40,7 @@ CCL_NAMESPACE_BEGIN
    const char *label;
 #define OSL_CLOSURE_STRUCT_END(Upper, lower) \
  } \
-  ; \
-  ccl_device void osl_closure_##lower##_setup(KernelGlobals kg, \
-                                              ccl_private ShaderData *sd, \
-                                              uint32_t path_flag, \
-                                              float3 weight, \
-                                              ccl_private Upper##Closure *closure);
+  ;
 #define OSL_CLOSURE_STRUCT_MEMBER(Upper, TYPE, type, name, key) type name;
 #define OSL_CLOSURE_STRUCT_ARRAY_MEMBER(Upper, TYPE, type, name, key, size) type name[size];

@@ -210,11 +205,9 @@ ccl_device void osl_closure_microfacet_setup(KernelGlobals kg,
  bsdf->ior = closure->ior;
  bsdf->T = closure->T;

-  static OSL::ustring u_ggx("ggx");
-  static OSL::ustring u_default("default");
-
  /* GGX */
-  if (closure->distribution == u_ggx || closure->distribution == u_default) {
+  if (closure->distribution == make_string("ggx", 11253504724482777663ull) ||
+      closure->distribution == make_string("default", 4430693559278735917ull)) {
    if (!closure->refract) {
      if (closure->alpha_x == closure->alpha_y) {
        /* Isotropic */
@@ -1000,18 +993,14 @@ ccl_device void osl_closure_bssrdf_setup(KernelGlobals kg,
                                         float3 weight,
                                         ccl_private const BSSRDFClosure *closure)
 {
-  static ustring u_burley("burley");
-  static ustring u_random_walk_fixed_radius("random_walk_fixed_radius");
-  static ustring u_random_walk("random_walk");
-
  ClosureType type;
-  if (closure->method == u_burley) {
+  if (closure->method == make_string("burley", 186330084368958868ull)) {
    type = CLOSURE_BSSRDF_BURLEY_ID;
  }
-  else if (closure->method == u_random_walk_fixed_radius) {
+  else if (closure->method == make_string("random_walk_fixed_radius", 5695810351010063150ull)) {
    type = CLOSURE_BSSRDF_RANDOM_WALK_FIXED_RADIUS_ID;
  }
-  else if (closure->method == u_random_walk) {
+  else if (closure->method == make_string("random_walk", 11360609267673527222ull)) {
    type = CLOSURE_BSSRDF_RANDOM_WALK_ID;
  }
  else {
--- a/intern/cycles/kernel/osl/closures_template.h
+++ b/intern/cycles/kernel/osl/closures_template.h
@@ -40,7 +40,7 @@ OSL_CLOSURE_STRUCT_BEGIN(Transparent, transparent)
 OSL_CLOSURE_STRUCT_END(Transparent, transparent)

 OSL_CLOSURE_STRUCT_BEGIN(Microfacet, microfacet)
-  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, ustring, distribution, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, STRING, DeviceString, distribution, NULL)
  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, N, NULL)
  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, VECTOR, packed_float3, T, NULL)
  OSL_CLOSURE_STRUCT_MEMBER(Microfacet, FLOAT, float, alpha_x, NULL)
@@ -210,7 +210,7 @@ OSL_CLOSURE_STRUCT_BEGIN(PhongRamp, phong_ramp)
 OSL_CLOSURE_STRUCT_END(PhongRamp, phong_ramp)

 OSL_CLOSURE_STRUCT_BEGIN(BSSRDF, bssrdf)
-  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, ustring, method, NULL)
+  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, STRING, DeviceString, method, NULL)
  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, N, NULL)
  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, radius, NULL)
  OSL_CLOSURE_STRUCT_MEMBER(BSSRDF, VECTOR, packed_float3, albedo, NULL)
--- a/intern/cycles/kernel/osl/osl.h
+++ b/intern/cycles/kernel/osl/osl.h
@@ -1,38 +1,171 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Adapted from Open Shading Language
+ * Copyright (c) 2009-2010 Sony Pictures Imageworks Inc., et al.
+ * All Rights Reserved.
+ *
+ * Modifications Copyright 2011-2022 Blender Foundation. */

 #pragma once

 /* OSL Shader Engine
 *
- * Holds all variables to execute and use OSL shaders from the kernel. These
- * are initialized externally by OSLShaderManager before rendering starts.
- *
- * Before/after a thread starts rendering, thread_init/thread_free must be
- * called, which will store any per thread OSL state in thread local storage.
- * This means no thread state must be passed along in the kernel itself.
+ * Holds all variables to execute and use OSL shaders from the kernel.
 */

 #include "kernel/osl/types.h"

+#include "kernel/osl/closures_setup.h"
+
 CCL_NAMESPACE_BEGIN

-class OSLShader {
- public:
-  /* eval */
-  static void eval_surface(const KernelGlobalsCPU *kg,
-                           const void *state,
-                           ShaderData *sd,
-                           uint32_t path_flag);
-  static void eval_background(const KernelGlobalsCPU *kg,
-                              const void *state,
-                              ShaderData *sd,
-                              uint32_t path_flag);
-  static void eval_volume(const KernelGlobalsCPU *kg,
-                          const void *state,
-                          ShaderData *sd,
-                          uint32_t path_flag);
-  static void eval_displacement(const KernelGlobalsCPU *kg, const void *state, ShaderData *sd);
-};
+ccl_device_inline void shaderdata_to_shaderglobals(KernelGlobals kg,
+                                                   ccl_private ShaderData *sd,
+                                                   uint32_t path_flag,
+                                                   ccl_private ShaderGlobals *globals)
+{
+  const differential3 dP = differential_from_compact(sd->Ng, sd->dP);
+  const differential3 dI = differential_from_compact(sd->I, sd->dI);
+
+  /* copy from shader data to shader globals */
+  globals->P = sd->P;
+  globals->dPdx = dP.dx;
+  globals->dPdy = dP.dy;
+  globals->I = sd->I;
+  globals->dIdx = dI.dx;
+  globals->dIdy = dI.dy;
+  globals->N = sd->N;
+  globals->Ng = sd->Ng;
+  globals->u = sd->u;
+  globals->dudx = sd->du.dx;
+  globals->dudy = sd->du.dy;
+  globals->v = sd->v;
+  globals->dvdx = sd->dv.dx;
+  globals->dvdy = sd->dv.dy;
+  globals->dPdu = sd->dPdu;
+  globals->dPdv = sd->dPdv;
+  globals->time = sd->time;
+  globals->dtime = 1.0f;
+  globals->surfacearea = 1.0f;
+  globals->raytype = path_flag;
+  globals->flipHandedness = 0;
+  globals->backfacing = (sd->flag & SD_BACKFACING);
+
+  /* shader data to be used in services callbacks */
+  globals->renderstate = sd;
+
+  /* hacky, we leave it to services to fetch actual object matrix */
+  globals->shader2common = sd;
+  globals->object2common = sd;
+
+  /* must be set to NULL before execute */
+  globals->Ci = nullptr;
+}
+
+ccl_device void flatten_closure_tree(KernelGlobals kg,
+                                     ccl_private ShaderData *sd,
+                                     uint32_t path_flag,
+                                     ccl_private const OSLClosure *closure)
+{
+  int stack_size = 0;
+  float3 weight = one_float3();
+  float3 weight_stack[16];
+  ccl_private const OSLClosure *closure_stack[16];
+
+  while (closure) {
+    switch (closure->id) {
+      case OSL_CLOSURE_MUL_ID: {
+        ccl_private const OSLClosureMul *mul = static_cast<ccl_private const OSLClosureMul *>(
+            closure);
+        weight *= mul->weight;
+        closure = mul->closure;
+        continue;
+      }
+      case OSL_CLOSURE_ADD_ID: {
+        if (stack_size >= 16) {
+          kernel_assert(!"Exhausted OSL closure stack");
+          break;
+        }
+        ccl_private const OSLClosureAdd *add = static_cast<ccl_private const OSLClosureAdd *>(
+            closure);
+        closure = add->closureA;
+        weight_stack[stack_size] = weight;
+        closure_stack[stack_size++] = add->closureB;
+        continue;
+      }
+#define OSL_CLOSURE_STRUCT_BEGIN(Upper, lower) \
+  case OSL_CLOSURE_##Upper##_ID: { \
+    ccl_private const OSLClosureComponent *comp = \
+        static_cast<ccl_private const OSLClosureComponent *>(closure); \
+    osl_closure_##lower##_setup(kg, \
+                                sd, \
+                                path_flag, \
+                                weight * comp->weight, \
+                                reinterpret_cast<ccl_private const Upper##Closure *>(comp + 1)); \
+    break; \
+  }
+#include "closures_template.h"
+      default:
+        break;
+    }
+
+    if (stack_size > 0) {
+      weight = weight_stack[--stack_size];
+      closure = closure_stack[stack_size];
+    }
+    else {
+      closure = nullptr;
+    }
+  }
+}
+
+#ifndef __KERNEL_GPU__
+
+template<ShaderType type>
+void osl_eval_nodes(const KernelGlobalsCPU *kg,
+                    const void *state,
+                    ShaderData *sd,
+                    uint32_t path_flag);
+
+#else
+
+template<ShaderType type, typename ConstIntegratorGenericState>
+ccl_device_inline void osl_eval_nodes(KernelGlobals kg,
+                                      ConstIntegratorGenericState state,
+                                      ccl_private ShaderData *sd,
+                                      uint32_t path_flag)
+{
+  ShaderGlobals globals;
+  shaderdata_to_shaderglobals(kg, sd, path_flag, &globals);
+
+  const int shader = sd->shader & SHADER_MASK;
+
+#  ifdef __KERNEL_OPTIX__
+  uint8_t group_data[2048];
+  uint8_t closure_pool[1024];
+  sd->osl_closure_pool = closure_pool;
+
+  unsigned int optix_dc_index = 2 /* NUM_CALLABLE_PROGRAM_GROUPS */ +
+                                (shader + type * kernel_data.max_shaders) * 2;
+  optixDirectCall<void>(optix_dc_index + 0,
+                        /* shaderglobals_ptr = */ &globals,
+                        /* groupdata_ptr = */ (void *)group_data,
+                        /* userdata_base_ptr = */ (void *)nullptr,
+                        /* output_base_ptr = */ (void *)nullptr,
+                        /* shadeindex = */ 0);
+  optixDirectCall<void>(optix_dc_index + 1,
+                        /* shaderglobals_ptr = */ &globals,
+                        /* groupdata_ptr = */ (void *)group_data,
+                        /* userdata_base_ptr = */ (void *)nullptr,
+                        /* output_base_ptr = */ (void *)nullptr,
+                        /* shadeindex = */ 0);
+#  endif
+
+  if (globals.Ci) {
+    flatten_closure_tree(kg, sd, path_flag, globals.Ci);
+  }
+}
+
+#endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -119,8 +119,8 @@ ustring OSLRenderServices::u_u("u");
 ustring OSLRenderServices::u_v("v");
 ustring OSLRenderServices::u_empty;

-OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system)
-    : OSL::RendererServices(texture_system)
+OSLRenderServices::OSLRenderServices(OSL::TextureSystem *texture_system, int device_type)
+    : OSL::RendererServices(texture_system), device_type_(device_type)
 {
 }

@@ -131,6 +131,17 @@ OSLRenderServices::~OSLRenderServices()
  }
 }

+int OSLRenderServices::supports(string_view feature) const
+{
+#ifdef WITH_OPTIX
+  if (feature == "OptiX") {
+    return device_type_ == DEVICE_OPTIX;
+  }
+#endif
+
+  return false;
+}
+
 bool OSLRenderServices::get_matrix(OSL::ShaderGlobals *sg,
                                   OSL::Matrix44 &result,
                                   OSL::TransformationPtr xform,
@@ -1139,29 +1150,40 @@ TextureSystem::TextureHandle *OSLRenderServices::get_texture_handle(ustring file
 {
  OSLTextureHandleMap::iterator it = textures.find(filename);

-  /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
-  if (it != textures.end()) {
-    if (it->second->type != OSLTextureHandle::OIIO) {
-      return (TextureSystem::TextureHandle *)it->second.get();
+  if (device_type_ == DEVICE_CPU) {
+    /* For non-OIIO textures, just return a pointer to our own OSLTextureHandle. */
+    if (it != textures.end()) {
+      if (it->second->type != OSLTextureHandle::OIIO) {
+        return (TextureSystem::TextureHandle *)it->second.get();
+      }
+    }
+
+    /* Get handle from OpenImageIO. */
+    OSL::TextureSystem *ts = m_texturesys;
+    TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
+    if (handle == NULL) {
+      return NULL;
+    }
+
+    /* Insert new OSLTextureHandle if needed. */
+    if (it == textures.end()) {
+      textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
+      it = textures.find(filename);
+    }
+
+    /* Assign OIIO texture handle and return. */
+    it->second->oiio_handle = handle;
+    return (TextureSystem::TextureHandle *)it->second.get();
+  }
+  else {
+    if (it != textures.end() && it->second->type == OSLTextureHandle::SVM &&
+        it->second->svm_slots[0].w == -1) {
+      return reinterpret_cast<TextureSystem::TextureHandle *>(
+          static_cast<uintptr_t>(it->second->svm_slots[0].y + 1));
    }
-  }

-  /* Get handle from OpenImageIO. */
-  OSL::TextureSystem *ts = m_texturesys;
-  TextureSystem::TextureHandle *handle = ts->get_texture_handle(filename);
-  if (handle == NULL) {
    return NULL;
  }
-
-  /* Insert new OSLTextureHandle if needed. */
-  if (it == textures.end()) {
-    textures.insert(filename, new OSLTextureHandle(OSLTextureHandle::OIIO));
-    it = textures.find(filename);
-  }
-
-  /* Assign OIIO texture handle and return. */
-  it->second->oiio_handle = handle;
-  return (TextureSystem::TextureHandle *)it->second.get();
 }

 bool OSLRenderServices::good(TextureSystem::TextureHandle *texture_handle)
--- a/intern/cycles/kernel/osl/services.h
+++ b/intern/cycles/kernel/osl/services.h
@@ -22,11 +22,8 @@ class PtexCache;

 CCL_NAMESPACE_BEGIN

-class Object;
 class Scene;
-class Shader;
 struct ShaderData;
-struct float3;
 struct KernelGlobalsCPU;

 /* OSL Texture Handle
@@ -73,11 +70,13 @@ typedef OIIO::unordered_map_concurrent<ustring, OSLTextureHandleRef, ustringHash

 class OSLRenderServices : public OSL::RendererServices {
 public:
-  OSLRenderServices(OSL::TextureSystem *texture_system);
+  OSLRenderServices(OSL::TextureSystem *texture_system, int device_type);
  ~OSLRenderServices();

  static void register_closures(OSL::ShadingSystem *ss);

+  int supports(string_view feature) const override;
+
  bool get_matrix(OSL::ShaderGlobals *sg,
                  OSL::Matrix44 &result,
                  OSL::TransformationPtr xform,
@@ -324,6 +323,9 @@ class OSLRenderServices : public OSL::RendererServices {
   * and is required because texture handles are cached as part of the shared
   * shading system. */
  OSLTextureHandleMap textures;
+
+ private:
+  int device_type_;
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/osl/services_gpu.h
+++ b/intern/cycles/kernel/osl/services_gpu.h
--- a/intern/cycles/kernel/osl/services_optix.cu
+++ b/intern/cycles/kernel/osl/services_optix.cu
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define WITH_OSL
+
+// clang-format off
+#include "kernel/device/optix/compat.h"
+#include "kernel/device/optix/globals.h"
+
+#include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */
+
+#include "kernel/osl/services_gpu.h"
+// clang-format on
+
+extern "C" __device__ void __direct_callable__dummy_services()
+{
+}
--- a/intern/cycles/kernel/osl/types.h
+++ b/intern/cycles/kernel/osl/types.h
@@ -5,9 +5,53 @@

 CCL_NAMESPACE_BEGIN

+struct DeviceString {
+#if defined(__KERNEL_GPU__)
+  /* Strings are represented by their hashes in CUDA and OptiX. */
+  size_t str_;
+
+  ccl_device_inline_method uint64_t hash() const
+  {
+    return str_;
+  }
+#elif defined(OPENIMAGEIO_USTRING_H)
+  ustring str_;
+
+  ccl_device_inline_method uint64_t hash() const
+  {
+    return str_.hash();
+  }
+#else
+  const char *str_;
+#endif
+
+  ccl_device_inline_method bool operator==(DeviceString b) const
+  {
+    return str_ == b.str_;
+  }
+  ccl_device_inline_method bool operator!=(DeviceString b) const
+  {
+    return str_ != b.str_;
+  }
+};
+
+ccl_device_inline DeviceString make_string(const char *str, size_t hash)
+{
+#if defined(__KERNEL_GPU__)
+  (void)str;
+  return {hash};
+#elif defined(OPENIMAGEIO_USTRING_H)
+  (void)hash;
+  return {ustring(str)};
+#else
+  (void)hash;
+  return {str};
+#endif
+}
+
 /* Closure */

-enum ClosureTypeOSL {
+enum OSLClosureType {
  OSL_CLOSURE_MUL_ID = -1,
  OSL_CLOSURE_ADD_ID = -2,

@@ -17,4 +61,60 @@ enum ClosureTypeOSL {
 #include "closures_template.h"
 };

+struct OSLClosure {
+  OSLClosureType id;
+};
+
+struct ccl_align(8) OSLClosureMul : public OSLClosure
+{
+  packed_float3 weight;
+  ccl_private const OSLClosure *closure;
+};
+
+struct ccl_align(8) OSLClosureAdd : public OSLClosure
+{
+  ccl_private const OSLClosure *closureA;
+  ccl_private const OSLClosure *closureB;
+};
+
+struct ccl_align(8) OSLClosureComponent : public OSLClosure
+{
+  packed_float3 weight;
+};
+
+/* Globals */
+
+struct ShaderGlobals {
+  packed_float3 P, dPdx, dPdy;
+  packed_float3 dPdz;
+  packed_float3 I, dIdx, dIdy;
+  packed_float3 N;
+  packed_float3 Ng;
+  float u, dudx, dudy;
+  float v, dvdx, dvdy;
+  packed_float3 dPdu, dPdv;
+  float time;
+  float dtime;
+  packed_float3 dPdtime;
+  packed_float3 Ps, dPsdx, dPsdy;
+  ccl_private void *renderstate;
+  ccl_private void *tracedata;
+  ccl_private void *objdata;
+  void *context;
+  void *renderer;
+  ccl_private void *object2common;
+  ccl_private void *shader2common;
+  ccl_private OSLClosure *Ci;
+  float surfacearea;
+  int raytype;
+  int flipHandedness;
+  int backfacing;
+};
+
+struct OSLNoiseOptions {
+};
+
+struct OSLTextureOptions {
+};
+
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/svm/noise.h
+++ b/intern/cycles/kernel/svm/noise.h
@@ -39,11 +39,11 @@ ccl_device_noinline_cpu float perlin_1d(float x)
 }

 /* 2D, 3D, and 4D noise can be accelerated using SSE, so we first check if
- * SSE is supported, that is, if __KERNEL_SSE2__ is defined. If it is not
+ * SSE is supported, that is, if __KERNEL_SSE__ is defined. If it is not
 * supported, we do a standard implementation, but if it is supported, we
 * do an implementation using SSE intrinsics.
 */
-#if !defined(__KERNEL_SSE2__)
+#if !defined(__KERNEL_SSE__)

 /* ** Standard Implementation ** */

@@ -250,18 +250,18 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)

 /* SSE Bilinear Interpolation:
 *
- * The function takes two ssef inputs:
+ * The function takes two float4 inputs:
 * - p : Contains the values at the points (v0, v1, v2, v3).
 * - f : Contains the values (x, y, _, _). The third and fourth values are unused.
 *
 * The interpolation is done in two steps:
 * 1. Interpolate (v0, v1) and (v2, v3) along the x axis to get g (g0, g1).
 *    (v2, v3) is generated by moving v2 and v3 to the first and second
- *    places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ *    places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
 *    fourth values are unused.
 * 2. Interpolate g0 and g1 along the y axis to get the final value.
- *    g1 is generated by populating an ssef with the second value of g.
- *    Only the first value is important in the final ssef.
+ *    g1 is generated by populating a float4 with the second value of g.
+ *    Only the first value is important in the final float4.
 *
 * v1          v3          g1
 *  @ + + + + @            @                    y
@@ -272,27 +272,27 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 * v0          v2          g0
 *
 */
-ccl_device_inline ssef bi_mix(ssef p, ssef f)
+ccl_device_inline float4 bi_mix(float4 p, float4 f)
 {
-  ssef g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
+  float4 g = mix(p, shuffle<2, 3, 2, 3>(p), shuffle<0>(f));
  return mix(g, shuffle<1>(g), shuffle<1>(f));
 }

-ccl_device_inline ssef fade(const ssef &t)
+ccl_device_inline float4 fade(const float4 t)
 {
-  ssef a = madd(t, 6.0f, -15.0f);
-  ssef b = madd(t, a, 10.0f);
+  float4 a = madd(t, make_float4(6.0f), make_float4(-15.0f));
+  float4 b = madd(t, a, make_float4(10.0f));
  return (t * t) * (t * b);
 }

 /* Negate val if the nth bit of h is 1. */
 #  define negate_if_nth_bit(val, h, n) ((val) ^ cast(((h) & (1 << (n))) << (31 - (n))))

-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y)
 {
-  ssei h = hash & 7;
-  ssef u = select(h < 4, x, y);
-  ssef v = 2.0f * select(h < 4, y, x);
+  int4 h = hash & 7;
+  float4 u = select(h < 4, x, y);
+  float4 v = 2.0f * select(h < 4, y, x);
  return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }

@@ -310,28 +310,28 @@ ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y)
 */
 ccl_device_noinline_cpu float perlin_2d(float x, float y)
 {
-  ssei XY;
-  ssef fxy = floorfrac(ssef(x, y, 0.0f, 0.0f), &XY);
-  ssef uv = fade(fxy);
+  int4 XY;
+  float4 fxy = floorfrac(make_float4(x, y, 0.0f, 0.0f), &XY);
+  float4 uv = fade(fxy);

-  ssei XY1 = XY + 1;
-  ssei X = shuffle<0, 0, 0, 0>(XY, XY1);
-  ssei Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));
+  int4 XY1 = XY + make_int4(1);
+  int4 X = shuffle<0, 0, 0, 0>(XY, XY1);
+  int4 Y = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(XY, XY1));

-  ssei h = hash_ssei2(X, Y);
+  int4 h = hash_int4_2(X, Y);

-  ssef fxy1 = fxy - 1.0f;
-  ssef fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
-  ssef fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));
+  float4 fxy1 = fxy - make_float4(1.0f);
+  float4 fx = shuffle<0, 0, 0, 0>(fxy, fxy1);
+  float4 fy = shuffle<0, 2, 0, 2>(shuffle<1, 1, 1, 1>(fxy, fxy1));

-  ssef g = grad(h, fx, fy);
+  float4 g = grad(h, fx, fy);

  return extract<0>(bi_mix(g, uv));
 }

 /* SSE Trilinear Interpolation:
 *
- * The function takes three ssef inputs:
+ * The function takes three float4 inputs:
 * - p : Contains the values at the points (v0, v1, v2, v3).
 * - q : Contains the values at the points (v4, v5, v6, v7).
 * - f : Contains the values (x, y, z, _). The fourth value is unused.
@@ -340,11 +340,11 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
 * 1. Interpolate p and q along the x axis to get s (s0, s1, s2, s3).
 * 2. Interpolate (s0, s1) and (s2, s3) along the y axis to get g (g0, g1).
 *    (s2, s3) is generated by moving v2 and v3 to the first and second
- *    places of the ssef using the shuffle mask <2, 3, 2, 3>. The third and
+ *    places of the float4 using the shuffle mask <2, 3, 2, 3>. The third and
 *    fourth values are unused.
 * 3. Interpolate g0 and g1 along the z axis to get the final value.
- *    g1 is generated by populating an ssef with the second value of g.
- *    Only the first value is important in the final ssef.
+ *    g1 is generated by populating a float4 with the second value of g.
+ *    Only the first value is important in the final float4.
 *
 *   v3               v7
 *     @ + + + + + + @               s3 @
@@ -362,10 +362,10 @@ ccl_device_noinline_cpu float perlin_2d(float x, float y)
 *          @ + + + + + + @                  @
 *        v0               v4                 s0
 */
-ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
+ccl_device_inline float4 tri_mix(float4 p, float4 q, float4 f)
 {
-  ssef s = mix(p, q, shuffle<0>(f));
-  ssef g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
+  float4 s = mix(p, q, shuffle<0>(f));
+  float4 g = mix(s, shuffle<2, 3, 2, 3>(s), shuffle<1>(f));
  return mix(g, shuffle<1>(g), shuffle<2>(f));
 }

@@ -374,24 +374,24 @@ ccl_device_inline ssef tri_mix(ssef p, ssef q, ssef f)
 * supported, we do an SSE implementation, but if it is supported,
 * we do an implementation using AVX intrinsics.
 */
-#  if !defined(__KERNEL_AVX__)
+#  if !defined(__KERNEL_AVX2__)

-ccl_device_inline ssef grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z)
+ccl_device_inline float4 grad(const int4 hash, const float4 x, const float4 y, const float4 z)
 {
-  ssei h = hash & 15;
-  ssef u = select(h < 8, x, y);
-  ssef vt = select((h == 12) | (h == 14), x, z);
-  ssef v = select(h < 4, y, vt);
+  int4 h = hash & 15;
+  float4 u = select(h < 8, x, y);
+  float4 vt = select((h == 12) | (h == 14), x, z);
+  float4 v = select(h < 4, y, vt);
  return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }

-ccl_device_inline ssef
-grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &w)
+ccl_device_inline float4
+grad(const int4 hash, const float4 x, const float4 y, const float4 z, const float4 w)
 {
-  ssei h = hash & 31;
-  ssef u = select(h < 24, x, y);
-  ssef v = select(h < 16, y, z);
-  ssef s = select(h < 8, z, w);
+  int4 h = hash & 31;
+  float4 u = select(h < 24, x, y);
+  float4 v = select(h < 16, y, z);
+  float4 s = select(h < 8, z, w);
  return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
 }

@@ -401,7 +401,7 @@ grad(const ssei &hash, const ssef &x, const ssef &y, const ssef &z, const ssef &
 * between two trilinear interpolations.
 *
 */
-ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
+ccl_device_inline float4 quad_mix(float4 p, float4 q, float4 r, float4 s, float4 f)
 {
  return mix(tri_mix(p, q, f), tri_mix(r, s, f), shuffle<3>(f));
 }
@@ -427,23 +427,23 @@ ccl_device_inline ssef quad_mix(ssef p, ssef q, ssef r, ssef s, ssef f)
 */
 ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 {
-  ssei XYZ;
-  ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
-  ssef uvw = fade(fxyz);
+  int4 XYZ;
+  float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+  float4 uvw = fade(fxyz);

-  ssei XYZ1 = XYZ + 1;
-  ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+  int4 XYZ1 = XYZ + make_int4(1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));

-  ssei h1 = hash_ssei3(shuffle<0>(XYZ), Y, Z);
-  ssei h2 = hash_ssei3(shuffle<0>(XYZ1), Y, Z);
+  int4 h1 = hash_int4_3(shuffle<0>(XYZ), Y, Z);
+  int4 h2 = hash_int4_3(shuffle<0>(XYZ1), Y, Z);

-  ssef fxyz1 = fxyz - 1.0f;
-  ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+  float4 fxyz1 = fxyz - make_float4(1.0f);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));

-  ssef g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
-  ssef g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);
+  float4 g1 = grad(h1, shuffle<0>(fxyz), fy, fz);
+  float4 g2 = grad(h2, shuffle<0>(fxyz1), fy, fz);

  return extract<0>(tri_mix(g1, g2, uvw));
 }
@@ -481,29 +481,29 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 */
 ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 {
-  ssei XYZW;
-  ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
-  ssef uvws = fade(fxyzw);
+  int4 XYZW;
+  float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+  float4 uvws = fade(fxyzw);

-  ssei XYZW1 = XYZW + 1;
-  ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+  int4 XYZW1 = XYZW + make_int4(1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));

-  ssei h1 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
-  ssei h2 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));
+  int4 h1 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW));
+  int4 h2 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW));

-  ssei h3 = hash_ssei4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
-  ssei h4 = hash_ssei4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));
+  int4 h3 = hash_int4_4(shuffle<0>(XYZW), Y, Z, shuffle<3>(XYZW1));
+  int4 h4 = hash_int4_4(shuffle<0>(XYZW1), Y, Z, shuffle<3>(XYZW1));

-  ssef fxyzw1 = fxyzw - 1.0f;
-  ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+  float4 fxyzw1 = fxyzw - make_float4(1.0f);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));

-  ssef g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
-  ssef g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));
+  float4 g1 = grad(h1, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw));
+  float4 g2 = grad(h2, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw));

-  ssef g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
-  ssef g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));
+  float4 g3 = grad(h3, shuffle<0>(fxyzw), fy, fz, shuffle<3>(fxyzw1));
+  float4 g4 = grad(h4, shuffle<0>(fxyzw1), fy, fz, shuffle<3>(fxyzw1));

  return extract<0>(quad_mix(g1, g2, g3, g4, uvws));
 }
@@ -512,22 +512,22 @@ ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)

 /* AVX Implementation */

-ccl_device_inline avxf grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z)
+ccl_device_inline vfloat8 grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z)
 {
-  avxi h = hash & 15;
-  avxf u = select(h < 8, x, y);
-  avxf vt = select((h == 12) | (h == 14), x, z);
-  avxf v = select(h < 4, y, vt);
+  vint8 h = hash & 15;
+  vfloat8 u = select(h < 8, x, y);
+  vfloat8 vt = select((h == 12) | (h == 14), x, z);
+  vfloat8 v = select(h < 4, y, vt);
  return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1);
 }

-ccl_device_inline avxf
-grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &w)
+ccl_device_inline vfloat8
+grad(const vint8 hash, const vfloat8 x, const vfloat8 y, const vfloat8 z, const vfloat8 w)
 {
-  avxi h = hash & 31;
-  avxf u = select(h < 24, x, y);
-  avxf v = select(h < 16, y, z);
-  avxf s = select(h < 8, z, w);
+  vint8 h = hash & 31;
+  vfloat8 u = select(h < 24, x, y);
+  vfloat8 v = select(h < 16, y, z);
+  vfloat8 s = select(h < 8, z, w);
  return negate_if_nth_bit(u, h, 0) + negate_if_nth_bit(v, h, 1) + negate_if_nth_bit(s, h, 2);
 }

@@ -537,13 +537,13 @@ grad(const avxi &hash, const avxf &x, const avxf &y, const avxf &z, const avxf &
 * 1. Interpolate p and q along the w axis to get s.
 * 2. Trilinearly interpolate (s0, s1, s2, s3) and (s4, s5, s6, s7) to get the final
 *    value. (s0, s1, s2, s3) and (s4, s5, s6, s7) are generated by extracting the
- *    low and high ssef from s.
+ *    low and high float4 from s.
 *
 */
-ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
+ccl_device_inline float4 quad_mix(vfloat8 p, vfloat8 q, float4 f)
 {
-  ssef fv = shuffle<3>(f);
-  avxf s = mix(p, q, avxf(fv, fv));
+  float4 fv = shuffle<3>(f);
+  vfloat8 s = mix(p, q, make_vfloat8(fv, fv));
  return tri_mix(low(s), high(s), f);
 }

@@ -565,25 +565,25 @@ ccl_device_inline ssef quad_mix(avxf p, avxf q, ssef f)
 */
 ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 {
-  ssei XYZ;
-  ssef fxyz = floorfrac(ssef(x, y, z, 0.0f), &XYZ);
-  ssef uvw = fade(fxyz);
+  int4 XYZ;
+  float4 fxyz = floorfrac(make_float4(x, y, z, 0.0f), &XYZ);
+  float4 uvw = fade(fxyz);

-  ssei XYZ1 = XYZ + 1;
-  ssei X = shuffle<0>(XYZ);
-  ssei X1 = shuffle<0>(XYZ1);
-  ssei Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));
+  int4 XYZ1 = XYZ + make_int4(1);
+  int4 X = shuffle<0>(XYZ);
+  int4 X1 = shuffle<0>(XYZ1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZ, XYZ1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZ, XYZ1));

-  avxi h = hash_avxi3(avxi(X, X1), avxi(Y, Y), avxi(Z, Z));
+  vint8 h = hash_int8_3(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z));

-  ssef fxyz1 = fxyz - 1.0f;
-  ssef fx = shuffle<0>(fxyz);
-  ssef fx1 = shuffle<0>(fxyz1);
-  ssef fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));
+  float4 fxyz1 = fxyz - make_float4(1.0f);
+  float4 fx = shuffle<0>(fxyz);
+  float4 fx1 = shuffle<0>(fxyz1);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyz, fxyz1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyz, fxyz1));

-  avxf g = grad(h, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz));
+  vfloat8 g = grad(h, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz));

  return extract<0>(tri_mix(low(g), high(g), uvw));
 }
@@ -617,31 +617,37 @@ ccl_device_noinline_cpu float perlin_3d(float x, float y, float z)
 */
 ccl_device_noinline_cpu float perlin_4d(float x, float y, float z, float w)
 {
-  ssei XYZW;
-  ssef fxyzw = floorfrac(ssef(x, y, z, w), &XYZW);
-  ssef uvws = fade(fxyzw);
+  int4 XYZW;
+  float4 fxyzw = floorfrac(make_float4(x, y, z, w), &XYZW);
+  float4 uvws = fade(fxyzw);

-  ssei XYZW1 = XYZW + 1;
-  ssei X = shuffle<0>(XYZW);
-  ssei X1 = shuffle<0>(XYZW1);
-  ssei Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
-  ssei Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
-  ssei W = shuffle<3>(XYZW);
-  ssei W1 = shuffle<3>(XYZW1);
+  int4 XYZW1 = XYZW + make_int4(1);
+  int4 X = shuffle<0>(XYZW);
+  int4 X1 = shuffle<0>(XYZW1);
+  int4 Y = shuffle<1, 1, 1, 1>(XYZW, XYZW1);
+  int4 Z = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(XYZW, XYZW1));
+  int4 W = shuffle<3>(XYZW);
+  int4 W1 = shuffle<3>(XYZW1);

-  avxi h1 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W, W));
-  avxi h2 = hash_avxi4(avxi(X, X1), avxi(Y, Y), avxi(Z, Z), avxi(W1, W1));
+  vint8 h1 = hash_int8_4(make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W, W));
+  vint8 h2 = hash_int8_4(
+      make_vint8(X, X1), make_vint8(Y, Y), make_vint8(Z, Z), make_vint8(W1, W1));

-  ssef fxyzw1 = fxyzw - 1.0f;
-  ssef fx = shuffle<0>(fxyzw);
-  ssef fx1 = shuffle<0>(fxyzw1);
-  ssef fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
-  ssef fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
-  ssef fw = shuffle<3>(fxyzw);
-  ssef fw1 = shuffle<3>(fxyzw1);
+  float4 fxyzw1 = fxyzw - make_float4(1.0f);
+  float4 fx = shuffle<0>(fxyzw);
+  float4 fx1 = shuffle<0>(fxyzw1);
+  float4 fy = shuffle<1, 1, 1, 1>(fxyzw, fxyzw1);
+  float4 fz = shuffle<0, 2, 0, 2>(shuffle<2, 2, 2, 2>(fxyzw, fxyzw1));
+  float4 fw = shuffle<3>(fxyzw);
+  float4 fw1 = shuffle<3>(fxyzw1);

-  avxf g1 = grad(h1, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw, fw));
-  avxf g2 = grad(h2, avxf(fx, fx1), avxf(fy, fy), avxf(fz, fz), avxf(fw1, fw1));
+  vfloat8 g1 = grad(
+      h1, make_vfloat8(fx, fx1), make_vfloat8(fy, fy), make_vfloat8(fz, fz), make_vfloat8(fw, fw));
+  vfloat8 g2 = grad(h2,
+                    make_vfloat8(fx, fx1),
+                    make_vfloat8(fy, fy),
+                    make_vfloat8(fz, fz),
+                    make_vfloat8(fw1, fw1));

  return extract<0>(quad_mix(g1, g2, uvws));
 }
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -75,10 +75,14 @@ CCL_NAMESPACE_BEGIN
 #define __VOLUME__

 /* Device specific features */
-#ifndef __KERNEL_GPU__
-#  ifdef WITH_OSL
-#    define __OSL__
+#ifdef WITH_OSL
+#  define __OSL__
+#  ifdef __KERNEL_OPTIX__
+/* Kernels with OSL support are built separately in OptiX and don't need SVM. */
+#    undef __SVM__
 #  endif
+#endif
+#ifndef __KERNEL_GPU__
 #  ifdef WITH_PATH_GUIDING
 #    define __PATH_GUIDING__
 #  endif
@@ -918,9 +922,13 @@ typedef struct ccl_align(16) ShaderData
  float ray_dP;

 #ifdef __OSL__
+#  ifdef __KERNEL_GPU__
+  ccl_private uint8_t *osl_closure_pool;
+#  else
  const struct KernelGlobalsCPU *osl_globals;
  const struct IntegratorStateCPU *osl_path_state;
  const struct IntegratorShadowStateCPU *osl_shadow_path_state;
+#  endif
 #endif

  /* LCG state for closures that require additional random numbers. */
@@ -1531,6 +1539,9 @@ enum KernelFeatureFlag : uint32_t {

  /* Path guiding. */
  KERNEL_FEATURE_PATH_GUIDING = (1U << 26U),
+
+  /* OSL. */
+  KERNEL_FEATURE_OSL = (1U << 27U),
 };

 /* Shader node feature mask, to specialize shader evaluation for kernels. */
--- a/intern/cycles/scene/constant_fold.cpp
+++ b/intern/cycles/scene/constant_fold.cpp
@@ -386,46 +386,6 @@ void ConstantFolder::fold_mix_color(NodeMix type, bool clamp_factor, bool clamp)
  }
 }

-void ConstantFolder::fold_mix_float(bool clamp_factor, bool clamp) const
-{
-  ShaderInput *fac_in = node->input("Factor");
-  ShaderInput *float1_in = node->input("A");
-  ShaderInput *float2_in = node->input("B");
-
-  float fac = clamp_factor ? saturatef(node->get_float(fac_in->socket_type)) :
-                             node->get_float(fac_in->socket_type);
-  bool fac_is_zero = !fac_in->link && fac == 0.0f;
-  bool fac_is_one = !fac_in->link && fac == 1.0f;
-
-  /* remove no-op node when factor is 0.0 */
-  if (fac_is_zero) {
-    if (try_bypass_or_make_constant(float1_in, clamp)) {
-      return;
-    }
-  }
-
-  /* remove useless mix floats nodes */
-  if (float1_in->link && float2_in->link) {
-    if (float1_in->link == float2_in->link) {
-      try_bypass_or_make_constant(float1_in, clamp);
-      return;
-    }
-  }
-  else if (!float1_in->link && !float2_in->link) {
-    float value1 = node->get_float(float1_in->socket_type);
-    float value2 = node->get_float(float2_in->socket_type);
-    if (value1 == value2) {
-      try_bypass_or_make_constant(float1_in, clamp);
-      return;
-    }
-  }
-  /* remove no-op mix float node when factor is 1.0 */
-  if (fac_is_one) {
-    try_bypass_or_make_constant(float2_in, clamp);
-    return;
-  }
-}
-
 void ConstantFolder::fold_math(NodeMathType type) const
 {
  ShaderInput *value1_in = node->input("Value1");
--- a/intern/cycles/scene/constant_fold.h
+++ b/intern/cycles/scene/constant_fold.h
@@ -52,7 +52,6 @@ class ConstantFolder {
  /* Specific nodes. */
  void fold_mix(NodeMix type, bool clamp) const;
  void fold_mix_color(NodeMix type, bool clamp_factor, bool clamp) const;
-  void fold_mix_float(bool clamp_factor, bool clamp) const;
  void fold_math(NodeMathType type) const;
  void fold_vector_math(NodeVectorMathType type) const;
  void fold_mapping(NodeMappingType type) const;
--- a/intern/cycles/scene/osl.cpp
+++ b/intern/cycles/scene/osl.cpp
@@ -38,16 +38,17 @@ OSL::TextureSystem *OSLShaderManager::ts_shared = NULL;
 int OSLShaderManager::ts_shared_users = 0;
 thread_mutex OSLShaderManager::ts_shared_mutex;

-OSL::ShadingSystem *OSLShaderManager::ss_shared = NULL;
-OSLRenderServices *OSLShaderManager::services_shared = NULL;
+OSL::ErrorHandler OSLShaderManager::errhandler;
+map<int, OSL::ShadingSystem *> OSLShaderManager::ss_shared;
 int OSLShaderManager::ss_shared_users = 0;
 thread_mutex OSLShaderManager::ss_shared_mutex;
 thread_mutex OSLShaderManager::ss_mutex;
+
 int OSLCompiler::texture_shared_unique_id = 0;

 /* Shader Manager */

-OSLShaderManager::OSLShaderManager()
+OSLShaderManager::OSLShaderManager(Device *device) : device_(device)
 {
  texture_system_init();
  shading_system_init();
@@ -107,11 +108,12 @@ void OSLShaderManager::device_update_specific(Device *device,

  device_free(device, dscene, scene);

-  /* set texture system */
-  scene->image_manager->set_osl_texture_system((void *)ts);
+  /* set texture system (only on CPU devices, since GPU devices cannot use OIIO) */
+  if (device->info.type == DEVICE_CPU) {
+    scene->image_manager->set_osl_texture_system((void *)ts_shared);
+  }

  /* create shaders */
-  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
  Shader *background_shader = scene->background->get_shader(scene);

  foreach (Shader *shader, scene->shaders) {
@@ -125,22 +127,34 @@ void OSLShaderManager::device_update_specific(Device *device,
     * compile shaders alternating */
    thread_scoped_lock lock(ss_mutex);

-    OSLCompiler compiler(this, services, ss, scene);
-    compiler.background = (shader == background_shader);
-    compiler.compile(og, shader);
+    device->foreach_device(
+        [this, scene, shader, background = (shader == background_shader)](Device *sub_device) {
+          OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+          OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+          OSLCompiler compiler(this, ss, scene);
+          compiler.background = background;
+          compiler.compile(og, shader);
+        });

    if (shader->get_use_mis() && shader->has_surface_emission)
      scene->light_manager->tag_update(scene, LightManager::SHADER_COMPILED);
  }

  /* setup shader engine */
-  og->ss = ss;
-  og->ts = ts;
-  og->services = services;
-
  int background_id = scene->shader_manager->get_shader_id(background_shader);
-  og->background_state = og->surface_state[background_id & SHADER_MASK];
-  og->use = true;
+
+  device->foreach_device([background_id](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();
+    OSL::ShadingSystem *ss = ss_shared[sub_device->info.type];
+
+    og->ss = ss;
+    og->ts = ts_shared;
+    og->services = static_cast<OSLRenderServices *>(ss->renderer());
+
+    og->background_state = og->surface_state[background_id & SHADER_MASK];
+    og->use = true;
+  });

  foreach (Shader *shader, scene->shaders)
    shader->clear_modified();
@@ -148,8 +162,12 @@ void OSLShaderManager::device_update_specific(Device *device,
  update_flags = UPDATE_NONE;

  /* add special builtin texture types */
-  services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
-  services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+  for (const auto &[device_type, ss] : ss_shared) {
+    OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());
+
+    services->textures.insert(ustring("@ao"), new OSLTextureHandle(OSLTextureHandle::AO));
+    services->textures.insert(ustring("@bevel"), new OSLTextureHandle(OSLTextureHandle::BEVEL));
+  }

  device_update_common(device, dscene, scene, progress);

@@ -166,26 +184,35 @@ void OSLShaderManager::device_update_specific(Device *device,
     * is being freed after the Session is freed.
     */
    thread_scoped_lock lock(ss_shared_mutex);
-    ss->optimize_all_groups();
+    for (const auto &[device_type, ss] : ss_shared) {
+      ss->optimize_all_groups();
+    }
+  }
+
+  /* load kernels */
+  if (!device->load_osl_kernels()) {
+    progress.set_error(device->error_message());
  }
 }

 void OSLShaderManager::device_free(Device *device, DeviceScene *dscene, Scene *scene)
 {
-  OSLGlobals *og = (OSLGlobals *)device->get_cpu_osl_memory();
-
  device_free_common(device, dscene, scene);

  /* clear shader engine */
-  og->use = false;
-  og->ss = NULL;
-  og->ts = NULL;
+  device->foreach_device([](Device *sub_device) {
+    OSLGlobals *og = (OSLGlobals *)sub_device->get_cpu_osl_memory();

-  og->surface_state.clear();
-  og->volume_state.clear();
-  og->displacement_state.clear();
-  og->bump_state.clear();
-  og->background_state.reset();
+    og->use = false;
+    og->ss = NULL;
+    og->ts = NULL;
+
+    og->surface_state.clear();
+    og->volume_state.clear();
+    og->displacement_state.clear();
+    og->bump_state.clear();
+    og->background_state.reset();
+  });
 }

 void OSLShaderManager::texture_system_init()
@@ -193,7 +220,7 @@ void OSLShaderManager::texture_system_init()
  /* create texture system, shared between different renders to reduce memory usage */
  thread_scoped_lock lock(ts_shared_mutex);

-  if (ts_shared_users == 0) {
+  if (ts_shared_users++ == 0) {
    ts_shared = TextureSystem::create(true);

    ts_shared->attribute("automip", 1);
@@ -203,24 +230,18 @@ void OSLShaderManager::texture_system_init()
    /* effectively unlimited for now, until we support proper mipmap lookups */
    ts_shared->attribute("max_memory_MB", 16384);
  }
-
-  ts = ts_shared;
-  ts_shared_users++;
 }

 void OSLShaderManager::texture_system_free()
 {
  /* shared texture system decrease users and destroy if no longer used */
  thread_scoped_lock lock(ts_shared_mutex);
-  ts_shared_users--;

-  if (ts_shared_users == 0) {
+  if (--ts_shared_users == 0) {
    ts_shared->invalidate_all(true);
    OSL::TextureSystem::destroy(ts_shared);
    ts_shared = NULL;
  }
-
-  ts = NULL;
 }

 void OSLShaderManager::shading_system_init()
@@ -228,101 +249,105 @@ void OSLShaderManager::shading_system_init()
  /* create shading system, shared between different renders to reduce memory usage */
  thread_scoped_lock lock(ss_shared_mutex);

-  if (ss_shared_users == 0) {
-    /* Must use aligned new due to concurrent hash map. */
-    services_shared = util_aligned_new<OSLRenderServices>(ts_shared);
+  device_->foreach_device([](Device *sub_device) {
+    const DeviceType device_type = sub_device->info.type;

-    string shader_path = path_get("shader");
+    if (ss_shared_users++ == 0 || ss_shared.find(device_type) == ss_shared.end()) {
+      /* Must use aligned new due to concurrent hash map. */
+      OSLRenderServices *services = util_aligned_new<OSLRenderServices>(ts_shared, device_type);
+
+      string shader_path = path_get("shader");
 #  ifdef _WIN32
-    /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
-     * operate with file paths with any character. This requires to use wide
-     * char functions, but OSL uses old fashioned ANSI functions which means:
-     *
-     * - We have to convert our paths to ANSI before passing to OSL
-     * - OSL can't be used when there's a multi-byte character in the path
-     *   to the shaders folder.
-     */
-    shader_path = string_to_ansi(shader_path);
+      /* Annoying thing, Cycles stores paths in UTF-8 codepage, so it can
+       * operate with file paths with any character. This requires to use wide
+       * char functions, but OSL uses old fashioned ANSI functions which means:
+       *
+       * - We have to convert our paths to ANSI before passing to OSL
+       * - OSL can't be used when there's a multi-byte character in the path
+       *   to the shaders folder.
+       */
+      shader_path = string_to_ansi(shader_path);
 #  endif

-    ss_shared = new OSL::ShadingSystem(services_shared, ts_shared, &errhandler);
-    ss_shared->attribute("lockgeom", 1);
-    ss_shared->attribute("commonspace", "world");
-    ss_shared->attribute("searchpath:shader", shader_path);
-    ss_shared->attribute("greedyjit", 1);
+      OSL::ShadingSystem *ss = new OSL::ShadingSystem(services, ts_shared, &errhandler);
+      ss->attribute("lockgeom", 1);
+      ss->attribute("commonspace", "world");
+      ss->attribute("searchpath:shader", shader_path);
+      ss->attribute("greedyjit", 1);

-    VLOG_INFO << "Using shader search path: " << shader_path;
+      VLOG_INFO << "Using shader search path: " << shader_path;

-    /* our own ray types */
-    static const char *raytypes[] = {
-        "camera",         /* PATH_RAY_CAMERA */
-        "reflection",     /* PATH_RAY_REFLECT */
-        "refraction",     /* PATH_RAY_TRANSMIT */
-        "diffuse",        /* PATH_RAY_DIFFUSE */
-        "glossy",         /* PATH_RAY_GLOSSY */
-        "singular",       /* PATH_RAY_SINGULAR */
-        "transparent",    /* PATH_RAY_TRANSPARENT */
-        "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */
+      /* our own ray types */
+      static const char *raytypes[] = {
+          "camera",         /* PATH_RAY_CAMERA */
+          "reflection",     /* PATH_RAY_REFLECT */
+          "refraction",     /* PATH_RAY_TRANSMIT */
+          "diffuse",        /* PATH_RAY_DIFFUSE */
+          "glossy",         /* PATH_RAY_GLOSSY */
+          "singular",       /* PATH_RAY_SINGULAR */
+          "transparent",    /* PATH_RAY_TRANSPARENT */
+          "volume_scatter", /* PATH_RAY_VOLUME_SCATTER */

-        "shadow", /* PATH_RAY_SHADOW_OPAQUE */
-        "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */
+          "shadow", /* PATH_RAY_SHADOW_OPAQUE */
+          "shadow", /* PATH_RAY_SHADOW_TRANSPARENT */

-        "__unused__", /* PATH_RAY_NODE_UNALIGNED */
-        "__unused__", /* PATH_RAY_MIS_SKIP */
+          "__unused__", /* PATH_RAY_NODE_UNALIGNED */
+          "__unused__", /* PATH_RAY_MIS_SKIP */

-        "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */
+          "diffuse_ancestor", /* PATH_RAY_DIFFUSE_ANCESTOR */

-        /* Remaining irrelevant bits up to 32. */
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-        "__unused__",
-    };
+          /* Remaining irrelevant bits up to 32. */
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+          "__unused__",
+      };

-    const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
-    ss_shared->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);
+      const int nraytypes = sizeof(raytypes) / sizeof(raytypes[0]);
+      ss->attribute("raytypes", TypeDesc(TypeDesc::STRING, nraytypes), raytypes);

-    OSLRenderServices::register_closures(ss_shared);
+      OSLRenderServices::register_closures(ss);

-    loaded_shaders.clear();
-  }
+      ss_shared[device_type] = ss;
+    }
+  });

-  ss = ss_shared;
-  services = services_shared;
-  ss_shared_users++;
+  loaded_shaders.clear();
 }

 void OSLShaderManager::shading_system_free()
 {
  /* shared shading system decrease users and destroy if no longer used */
  thread_scoped_lock lock(ss_shared_mutex);
-  ss_shared_users--;

-  if (ss_shared_users == 0) {
-    delete ss_shared;
-    ss_shared = NULL;
+  device_->foreach_device([](Device * /*sub_device*/) {
+    if (--ss_shared_users == 0) {
+      for (const auto &[device_type, ss] : ss_shared) {
+        OSLRenderServices *services = static_cast<OSLRenderServices *>(ss->renderer());

-    util_aligned_delete(services_shared);
-    services_shared = NULL;
-  }
+        delete ss;

-  ss = NULL;
-  services = NULL;
+        util_aligned_delete(services);
+      }
+
+      ss_shared.clear();
+    }
+  });
 }

 bool OSLShaderManager::osl_compile(const string &inputfile, const string &outputfile)
@@ -447,7 +472,9 @@ const char *OSLShaderManager::shader_load_filepath(string filepath)

 const char *OSLShaderManager::shader_load_bytecode(const string &hash, const string &bytecode)
 {
-  ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+  for (const auto &[device_type, ss] : ss_shared) {
+    ss->LoadMemoryCompiledShader(hash.c_str(), bytecode.c_str());
+  }

  OSLShaderInfo info;

@@ -614,11 +641,11 @@ OSLNode *OSLShaderManager::osl_node(ShaderGraph *graph,

 /* Graph Compiler */

-OSLCompiler::OSLCompiler(OSLShaderManager *manager,
-                         OSLRenderServices *services,
-                         OSL::ShadingSystem *ss,
-                         Scene *scene)
-    : scene(scene), manager(manager), services(services), ss(ss)
+OSLCompiler::OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *ss, Scene *scene)
+    : scene(scene),
+      manager(manager),
+      services(static_cast<OSLRenderServices *>(ss->renderer())),
+      ss(ss)
 {
  current_type = SHADER_TYPE_SURFACE;
  current_shader = NULL;
@@ -629,6 +656,8 @@ string OSLCompiler::id(ShaderNode *node)
 {
  /* assign layer unique name based on pointer address + bump mode */
  stringstream stream;
+  stream.imbue(std::locale("C")); /* Ensure that no grouping characters (e.g. commas with en_US
+                                     locale) are added to the pointer string */
  stream << "node_" << node->type->name << "_" << node;

  return stream.str();
@@ -1124,7 +1153,12 @@ OSL::ShaderGroupRef OSLCompiler::compile_type(Shader *shader, ShaderGraph *graph
 {
  current_type = type;

-  OSL::ShaderGroupRef group = ss->ShaderGroupBegin(shader->name.c_str());
+  /* Use name hash to identify shader group to avoid issues with non-alphanumeric characters */
+  stringstream name;
+  name.imbue(std::locale("C"));
+  name << "shader_" << shader->name.hash();
+
+  OSL::ShaderGroupRef group = ss->ShaderGroupBegin(name.str());

  ShaderNode *output = graph->output();
  ShaderNodeSet dependencies;
--- a/intern/cycles/scene/osl.h
+++ b/intern/cycles/scene/osl.h
@@ -54,7 +54,7 @@ struct OSLShaderInfo {

 class OSLShaderManager : public ShaderManager {
 public:
-  OSLShaderManager();
+  OSLShaderManager(Device *device);
  ~OSLShaderManager();

  static void free_memory();
@@ -92,25 +92,22 @@ class OSLShaderManager : public ShaderManager {
                           const std::string &bytecode_hash = "",
                           const std::string &bytecode = "");

- protected:
+ private:
  void texture_system_init();
  void texture_system_free();

  void shading_system_init();
  void shading_system_free();

-  OSL::ShadingSystem *ss;
-  OSL::TextureSystem *ts;
-  OSLRenderServices *services;
-  OSL::ErrorHandler errhandler;
+  Device *device_;
  map<string, OSLShaderInfo> loaded_shaders;

  static OSL::TextureSystem *ts_shared;
  static thread_mutex ts_shared_mutex;
  static int ts_shared_users;

-  static OSL::ShadingSystem *ss_shared;
-  static OSLRenderServices *services_shared;
+  static OSL::ErrorHandler errhandler;
+  static map<int, OSL::ShadingSystem *> ss_shared;
  static thread_mutex ss_shared_mutex;
  static thread_mutex ss_mutex;
  static int ss_shared_users;
@@ -123,10 +120,7 @@ class OSLShaderManager : public ShaderManager {
 class OSLCompiler {
 public:
 #ifdef WITH_OSL
-  OSLCompiler(OSLShaderManager *manager,
-              OSLRenderServices *services,
-              OSL::ShadingSystem *shadingsys,
-              Scene *scene);
+  OSLCompiler(OSLShaderManager *manager, OSL::ShadingSystem *shadingsys, Scene *scene);
 #endif
  void compile(OSLGlobals *og, Shader *shader);

--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -99,11 +99,8 @@ Scene::Scene(const SceneParams &params_, Device *device)
 {
  memset((void *)&dscene.data, 0, sizeof(dscene.data));

-  /* OSL only works on the CPU */
-  if (device->info.has_osl)
-    shader_manager = ShaderManager::create(params.shadingsystem);
-  else
-    shader_manager = ShaderManager::create(SHADINGSYSTEM_SVM);
+  shader_manager = ShaderManager::create(
+      device->info.has_osl ? params.shadingsystem : SHADINGSYSTEM_SVM, device);

  light_manager = new LightManager();
  geometry_manager = new GeometryManager();
--- a/intern/cycles/scene/shader.cpp
+++ b/intern/cycles/scene/shader.cpp
@@ -395,15 +395,16 @@ ShaderManager::~ShaderManager()
 {
 }

-ShaderManager *ShaderManager::create(int shadingsystem)
+ShaderManager *ShaderManager::create(int shadingsystem, Device *device)
 {
  ShaderManager *manager;

  (void)shadingsystem; /* Ignored when built without OSL. */
+  (void)device;

 #ifdef WITH_OSL
  if (shadingsystem == SHADINGSYSTEM_OSL) {
-    manager = new OSLShaderManager();
+    manager = new OSLShaderManager(device);
  }
  else
 #endif
@@ -722,6 +723,10 @@ uint ShaderManager::get_kernel_features(Scene *scene)
    }
  }

+  if (use_osl()) {
+    kernel_features |= KERNEL_FEATURE_OSL;
+  }
+
  return kernel_features;
 }

--- a/intern/cycles/scene/shader.h
+++ b/intern/cycles/scene/shader.h
@@ -170,7 +170,7 @@ class ShaderManager {
    UPDATE_NONE = 0u,
  };

-  static ShaderManager *create(int shadingsystem);
+  static ShaderManager *create(int shadingsystem, Device *device);
  virtual ~ShaderManager();

  virtual void reset(Scene *scene) = 0;
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -5132,9 +5132,6 @@ void MixFloatNode::constant_fold(const ConstantFolder &folder)
    }
    folder.make_constant(a * (1 - fac) + b * fac);
  }
-  else {
-    folder.fold_mix_float(use_clamp, false);
-  }
 }

 /* Mix Vector */
@@ -5188,9 +5185,6 @@ void MixVectorNode::constant_fold(const ConstantFolder &folder)
    }
    folder.make_constant(a * (one_float3() - fac) + b * fac);
  }
-  else {
-    folder.fold_mix_color(NODE_MIX_BLEND, use_clamp, false);
-  }
 }

 /* Mix Vector Non Uniform */
--- a/intern/cycles/scene/shader_nodes.h
+++ b/intern/cycles/scene/shader_nodes.h
@@ -1539,6 +1539,10 @@ class OSLNode final : public ShaderNode {
  {
    return true;
  }
+  virtual int get_feature()
+  {
+    return ShaderNode::get_feature() | KERNEL_FEATURE_NODE_RAYTRACE;
+  }

  virtual bool equals(const ShaderNode & /*other*/)
  {
--- a/intern/cycles/test/CMakeLists.txt
+++ b/intern/cycles/test/CMakeLists.txt
@@ -45,17 +45,24 @@ set(SRC
 # Disable AVX tests on macOS. Rosetta has problems running them, and other
 # platforms should be enough to verify AVX operations are implemented correctly.
 if(NOT APPLE)
+  if(CXX_HAS_SSE)
+    list(APPEND SRC
+      util_float8_sse2_test.cpp
+    )
+    set_source_files_properties(util_float8_sse2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+  endif()
+
  if(CXX_HAS_AVX)
    list(APPEND SRC
-      util_avxf_avx_test.cpp
+      util_float8_avx_test.cpp
    )
-    set_source_files_properties(util_avxf_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
+    set_source_files_properties(util_float8_avx_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
  endif()
  if(CXX_HAS_AVX2)
    list(APPEND SRC
-      util_avxf_avx2_test.cpp
+      util_float8_avx2_test.cpp
    )
-    set_source_files_properties(util_avxf_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
+    set_source_files_properties(util_float8_avx2_test.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
  endif()
 endif()

--- a/intern/cycles/test/util_avxf_test.h
+++ b/intern/cycles/test/util_avxf_test.h
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2022 Blender Foundation */
-
-#include "testing/testing.h"
-#include "util/system.h"
-#include "util/types.h"
-
-CCL_NAMESPACE_BEGIN
-
-static bool validate_cpu_capabilities()
-{
-
-#ifdef __KERNEL_AVX2__
-  return system_cpu_support_avx2();
-#else
-#  ifdef __KERNEL_AVX__
-  return system_cpu_support_avx();
-#  endif
-#endif
-}
-
-#define INIT_AVX_TEST \
-  if (!validate_cpu_capabilities()) \
-    return; \
-\
-  const avxf avxf_a(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
-  const avxf avxf_b(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
-  const avxf avxf_c(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
-
-#define compare_vector_scalar(a, b) \
-  for (size_t index = 0; index < a.size; index++) \
-    EXPECT_FLOAT_EQ(a[index], b);
-
-#define compare_vector_vector(a, b) \
-  for (size_t index = 0; index < a.size; index++) \
-    EXPECT_FLOAT_EQ(a[index], b[index]);
-
-#define compare_vector_vector_near(a, b, abserror) \
-  for (size_t index = 0; index < a.size; index++) \
-    EXPECT_NEAR(a[index], b[index], abserror);
-
-#define basic_test_vv(a, b, op) \
-  INIT_AVX_TEST \
-  avxf c = a op b; \
-  for (size_t i = 0; i < a.size; i++) \
-    EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
-
-/* vector op float tests */
-#define basic_test_vf(a, b, op) \
-  INIT_AVX_TEST \
-  avxf c = a op b; \
-  for (size_t i = 0; i < a.size; i++) \
-    EXPECT_FLOAT_EQ(c[i], a[i] op b);
-
-static const float float_b = 1.5f;
-
-TEST(TEST_CATEGORY_NAME, avxf_add_vv){basic_test_vv(avxf_a, avxf_b, +)} TEST(TEST_CATEGORY_NAME,
-                                                                             avxf_sub_vv){
-    basic_test_vv(avxf_a, avxf_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vv){
-    basic_test_vv(avxf_a, avxf_b, *)} TEST(TEST_CATEGORY_NAME, avxf_div_vv){
-    basic_test_vv(avxf_a, avxf_b, /)} TEST(TEST_CATEGORY_NAME, avxf_add_vf){
-    basic_test_vf(avxf_a, float_b, +)} TEST(TEST_CATEGORY_NAME, avxf_sub_vf){
-    basic_test_vf(avxf_a, float_b, -)} TEST(TEST_CATEGORY_NAME, avxf_mul_vf){
-    basic_test_vf(avxf_a, float_b, *)} TEST(TEST_CATEGORY_NAME,
-                                            avxf_div_vf){basic_test_vf(avxf_a, float_b, /)}
-
-TEST(TEST_CATEGORY_NAME, avxf_ctor)
-{
-  INIT_AVX_TEST
-  compare_vector_scalar(avxf(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f),
-                        static_cast<float>(index));
-  compare_vector_scalar(avxf(1.0f), 1.0f);
-  compare_vector_vector(avxf(1.0f, 2.0f), avxf(1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f));
-  compare_vector_vector(avxf(1.0f, 2.0f, 3.0f, 4.0f),
-                        avxf(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f));
-  compare_vector_vector(avxf(make_float3(1.0f, 2.0f, 3.0f)),
-                        avxf(0.0f, 3.0f, 2.0f, 1.0f, 0.0f, 3.0f, 2.0f, 1.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_sqrt)
-{
-  INIT_AVX_TEST
-  compare_vector_vector(mm256_sqrt(avxf(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
-                        avxf(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_min_max)
-{
-  INIT_AVX_TEST
-  compare_vector_vector(min(avxf_a, avxf_b), avxf_a);
-  compare_vector_vector(max(avxf_a, avxf_b), avxf_b);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_set_sign)
-{
-  INIT_AVX_TEST
-  avxf res = set_sign_bit<1, 0, 0, 0, 0, 0, 0, 0>(avxf_a);
-  compare_vector_vector(res, avxf(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, -0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_msub)
-{
-  INIT_AVX_TEST
-  avxf res = msub(avxf_a, avxf_b, avxf_c);
-  avxf exp = avxf((avxf_a[7] * avxf_b[7]) - avxf_c[7],
-                  (avxf_a[6] * avxf_b[6]) - avxf_c[6],
-                  (avxf_a[5] * avxf_b[5]) - avxf_c[5],
-                  (avxf_a[4] * avxf_b[4]) - avxf_c[4],
-                  (avxf_a[3] * avxf_b[3]) - avxf_c[3],
-                  (avxf_a[2] * avxf_b[2]) - avxf_c[2],
-                  (avxf_a[1] * avxf_b[1]) - avxf_c[1],
-                  (avxf_a[0] * avxf_b[0]) - avxf_c[0]);
-  compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_madd)
-{
-  INIT_AVX_TEST
-  avxf res = madd(avxf_a, avxf_b, avxf_c);
-  avxf exp = avxf((avxf_a[7] * avxf_b[7]) + avxf_c[7],
-                  (avxf_a[6] * avxf_b[6]) + avxf_c[6],
-                  (avxf_a[5] * avxf_b[5]) + avxf_c[5],
-                  (avxf_a[4] * avxf_b[4]) + avxf_c[4],
-                  (avxf_a[3] * avxf_b[3]) + avxf_c[3],
-                  (avxf_a[2] * avxf_b[2]) + avxf_c[2],
-                  (avxf_a[1] * avxf_b[1]) + avxf_c[1],
-                  (avxf_a[0] * avxf_b[0]) + avxf_c[0]);
-  compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_nmadd)
-{
-  INIT_AVX_TEST
-  avxf res = nmadd(avxf_a, avxf_b, avxf_c);
-  avxf exp = avxf(avxf_c[7] - (avxf_a[7] * avxf_b[7]),
-                  avxf_c[6] - (avxf_a[6] * avxf_b[6]),
-                  avxf_c[5] - (avxf_a[5] * avxf_b[5]),
-                  avxf_c[4] - (avxf_a[4] * avxf_b[4]),
-                  avxf_c[3] - (avxf_a[3] * avxf_b[3]),
-                  avxf_c[2] - (avxf_a[2] * avxf_b[2]),
-                  avxf_c[1] - (avxf_a[1] * avxf_b[1]),
-                  avxf_c[0] - (avxf_a[0] * avxf_b[0]));
-  compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_compare)
-{
-  INIT_AVX_TEST
-  avxf a(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
-  avxf b(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
-  avxb res = a <= b;
-  int exp[8] = {
-      a[0] <= b[0] ? -1 : 0,
-      a[1] <= b[1] ? -1 : 0,
-      a[2] <= b[2] ? -1 : 0,
-      a[3] <= b[3] ? -1 : 0,
-      a[4] <= b[4] ? -1 : 0,
-      a[5] <= b[5] ? -1 : 0,
-      a[6] <= b[6] ? -1 : 0,
-      a[7] <= b[7] ? -1 : 0,
-  };
-  compare_vector_vector(res, exp);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_permute)
-{
-  INIT_AVX_TEST
-  avxf res = permute<3, 0, 1, 7, 6, 5, 2, 4>(avxf_b);
-  compare_vector_vector(res, avxf(4.0f, 6.0f, 3.0f, 2.0f, 1.0f, 7.0f, 8.0f, 5.0f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_blend)
-{
-  INIT_AVX_TEST
-  avxf res = blend<0, 0, 1, 0, 1, 0, 1, 0>(avxf_a, avxf_b);
-  compare_vector_vector(res, avxf(0.1f, 0.2f, 3.0f, 0.4f, 5.0f, 0.6f, 7.0f, 0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_shuffle)
-{
-  INIT_AVX_TEST
-  avxf res = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(avxf_a);
-  compare_vector_vector(res, avxf(0.4f, 0.2f, 0.1f, 0.3f, 0.5f, 0.6f, 0.7f, 0.8f));
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_cross)
-{
-  INIT_AVX_TEST
-  avxf res = cross(avxf_b, avxf_c);
-  compare_vector_vector_near(res,
-                             avxf(0.0f,
-                                  -9.5367432e-07f,
-                                  0.0f,
-                                  4.7683716e-07f,
-                                  0.0f,
-                                  -3.8146973e-06f,
-                                  3.8146973e-06f,
-                                  3.8146973e-06f),
-                             0.000002000f);
-}
-
-TEST(TEST_CATEGORY_NAME, avxf_dot3)
-{
-  INIT_AVX_TEST
-  float den, den2;
-  dot3(avxf_a, avxf_b, den, den2);
-  EXPECT_FLOAT_EQ(den, 14.9f);
-  EXPECT_FLOAT_EQ(den2, 2.9f);
-}
-
-CCL_NAMESPACE_END
--- a/intern/cycles/test/util_float8_avx2_test.cpp
+++ b/intern/cycles/test/util_float8_avx2_test.cpp
@@ -1,11 +1,13 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

+#define __KERNEL_SSE__
+#define __KERNEL_AVX__
 #define __KERNEL_AVX2__

 #define TEST_CATEGORY_NAME util_avx2

 #if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
    defined(__AVX2__)
-#  include "util_avxf_test.h"
+#  include "util_float8_test.h"
 #endif
--- a/intern/cycles/test/util_float8_avx_test.cpp
+++ b/intern/cycles/test/util_float8_avx_test.cpp
@@ -1,11 +1,12 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

+#define __KERNEL_SSE__
 #define __KERNEL_AVX__

 #define TEST_CATEGORY_NAME util_avx

 #if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
    defined(__AVX__)
-#  include "util_avxf_test.h"
+#  include "util_float8_test.h"
 #endif
--- a/intern/cycles/test/util_float8_sse2_test.cpp
+++ b/intern/cycles/test/util_float8_sse2_test.cpp
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#define __KERNEL_SSE__
+#define __KERNEL_SSE2__
+
+#define TEST_CATEGORY_NAME util_sse2
+
+#if (defined(i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)) && \
+    defined(__SSE2__)
+#  include "util_float8_test.h"
+#endif
--- a/intern/cycles/test/util_float8_test.h
+++ b/intern/cycles/test/util_float8_test.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#include "testing/testing.h"
+#include "util/math.h"
+#include "util/system.h"
+#include "util/types.h"
+
+CCL_NAMESPACE_BEGIN
+
+static bool validate_cpu_capabilities()
+{
+  /* Pick the runtime CPU check that matches the __KERNEL_*__ macro set by the includer. */
+#if defined(__KERNEL_AVX2__)
+  return system_cpu_support_avx2();
+#elif defined(__KERNEL_AVX__)
+  return system_cpu_support_avx();
+#elif defined(__KERNEL_SSE2__)
+  return system_cpu_support_sse2();
+#else
+  return false;
+#endif
+}
+
+#define INIT_FLOAT8_TEST \
+  if (!validate_cpu_capabilities()) \
+    return; \
+\
+  const vfloat8 float8_a = make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f); \
+  const vfloat8 float8_b = make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f); \
+  const vfloat8 float8_c = make_vfloat8(1.1f, 2.2f, 3.3f, 4.4f, 5.5f, 6.6f, 7.7f, 8.8f);
+
+#define compare_vector_scalar(a, b) \
+  for (size_t index = 0; index < 8; index++) \
+    EXPECT_FLOAT_EQ(a[index], b);
+
+#define compare_vector_vector(a, b) \
+  for (size_t index = 0; index < 8; index++) \
+    EXPECT_FLOAT_EQ(a[index], b[index]);
+
+#define compare_vector_vector_near(a, b, abserror) \
+  for (size_t index = 0; index < 8; index++) \
+    EXPECT_NEAR(a[index], b[index], abserror);
+
+#define basic_test_vv(a, b, op) \
+  INIT_FLOAT8_TEST \
+  vfloat8 c = a op b; \
+  for (size_t i = 0; i < 8; i++) \
+    EXPECT_FLOAT_EQ(c[i], a[i] op b[i]);
+
+/* vector op float tests */
+#define basic_test_vf(a, b, op) \
+  INIT_FLOAT8_TEST \
+  vfloat8 c = a op b; \
+  for (size_t i = 0; i < 8; i++) \
+    EXPECT_FLOAT_EQ(c[i], a[i] op b);
+
+static const float float_b = 1.5f;
+
+/* Element-wise arithmetic operators: vector op vector (vv), then vector op float (vf). */
+TEST(TEST_CATEGORY_NAME, float8_add_vv) { basic_test_vv(float8_a, float8_b, +) }
+TEST(TEST_CATEGORY_NAME, float8_sub_vv) { basic_test_vv(float8_a, float8_b, -) }
+TEST(TEST_CATEGORY_NAME, float8_mul_vv) { basic_test_vv(float8_a, float8_b, *) }
+TEST(TEST_CATEGORY_NAME, float8_div_vv) { basic_test_vv(float8_a, float8_b, /) }
+TEST(TEST_CATEGORY_NAME, float8_add_vf) { basic_test_vf(float8_a, float_b, +) }
+TEST(TEST_CATEGORY_NAME, float8_sub_vf) { basic_test_vf(float8_a, float_b, -) }
+TEST(TEST_CATEGORY_NAME, float8_mul_vf) { basic_test_vf(float8_a, float_b, *) }
+TEST(TEST_CATEGORY_NAME, float8_div_vf) { basic_test_vf(float8_a, float_b, /) }
+
+TEST(TEST_CATEGORY_NAME, float8_ctor)
+{
+  INIT_FLOAT8_TEST
+  compare_vector_scalar(make_vfloat8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f),
+                        static_cast<float>(index));
+  compare_vector_scalar(make_vfloat8(1.0f), 1.0f);
+}
+
+TEST(TEST_CATEGORY_NAME, float8_sqrt)
+{
+  INIT_FLOAT8_TEST
+  compare_vector_vector(sqrt(make_vfloat8(1.0f, 4.0f, 9.0f, 16.0f, 25.0f, 36.0f, 49.0f, 64.0f)),
+                        make_vfloat8(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
+}
+
+TEST(TEST_CATEGORY_NAME, float8_min_max)
+{
+  INIT_FLOAT8_TEST
+  compare_vector_vector(min(float8_a, float8_b), float8_a);
+  compare_vector_vector(max(float8_a, float8_b), float8_b);
+}
+
+TEST(TEST_CATEGORY_NAME, float8_shuffle)
+{
+  INIT_FLOAT8_TEST
+  vfloat8 res0 = shuffle<0, 1, 2, 3, 1, 3, 2, 0>(float8_a);
+  compare_vector_vector(res0, make_vfloat8(0.1f, 0.2f, 0.3f, 0.4f, 0.6f, 0.8f, 0.7f, 0.5f));
+  vfloat8 res1 = shuffle<3>(float8_a);
+  compare_vector_vector(res1, make_vfloat8(0.4f, 0.4f, 0.4f, 0.4f, 0.8f, 0.8f, 0.8f, 0.8f));
+  vfloat8 res2 = shuffle<3, 2, 1, 0>(float8_a, float8_b);
+  compare_vector_vector(res2, make_vfloat8(0.4f, 0.3f, 2.0f, 1.0f, 0.8f, 0.7f, 6.0f, 5.0f));
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/util/CMakeLists.txt
+++ b/intern/cycles/util/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SRC_HEADERS
  math_int2.h
  math_int3.h
  math_int4.h
+  math_int8.h
  math_matrix.h
  md5.h
  murmurhash.h
@@ -85,13 +86,7 @@ set(SRC_HEADERS
  rect.h
  set.h
  simd.h
-  avxf.h
-  avxb.h
-  avxi.h
  semaphore.h
-  sseb.h
-  ssef.h
-  ssei.h
  stack_allocator.h
  static_assert.h
  stats.h
@@ -118,6 +113,8 @@ set(SRC_HEADERS
  types_int3_impl.h
  types_int4.h
  types_int4_impl.h
+  types_int8.h
+  types_int8_impl.h
  types_spectrum.h
  types_uchar2.h
  types_uchar2_impl.h
--- a/intern/cycles/util/avxb.h
+++ b/intern/cycles/util/avxb.h
@@ -1,230 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2011-2013 Intel Corporation
- * Modifications Copyright 2014-2022 Blender Foundation. */
-
-#ifndef __UTIL_AVXB_H__
-#define __UTIL_AVXB_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxf;
-
-/*! 4-wide SSE bool type. */
-struct avxb {
-  typedef avxb Mask;   // mask type
-  typedef avxf Float;  // float type
-
-  enum { size = 8 };  // number of SIMD elements
-  union {
-    __m256 m256;
-    int32_t v[8];
-  };  // data
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Constructors, Assignment & Cast Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline avxb()
-  {
-  }
-  __forceinline avxb(const avxb &other)
-  {
-    m256 = other.m256;
-  }
-  __forceinline avxb &operator=(const avxb &other)
-  {
-    m256 = other.m256;
-    return *this;
-  }
-
-  __forceinline avxb(const __m256 input) : m256(input)
-  {
-  }
-  __forceinline avxb(const __m128 &a, const __m128 &b)
-      : m256(_mm256_insertf128_ps(_mm256_castps128_ps256(a), b, 1))
-  {
-  }
-  __forceinline operator const __m256 &(void) const
-  {
-    return m256;
-  }
-  __forceinline operator const __m256i(void) const
-  {
-    return _mm256_castps_si256(m256);
-  }
-  __forceinline operator const __m256d(void) const
-  {
-    return _mm256_castps_pd(m256);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Constants
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline avxb(FalseTy) : m256(_mm256_setzero_ps())
-  {
-  }
-  __forceinline avxb(TrueTy) : m256(_mm256_castsi256_ps(_mm256_set1_epi32(-1)))
-  {
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Array Access
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline bool operator[](const size_t i) const
-  {
-    assert(i < 8);
-    return (_mm256_movemask_ps(m256) >> i) & 1;
-  }
-  __forceinline int32_t &operator[](const size_t i)
-  {
-    assert(i < 8);
-    return v[i];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!(const avxb &a)
-{
-  return _mm256_xor_ps(a, avxb(True));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&(const avxb &a, const avxb &b)
-{
-  return _mm256_and_ps(a, b);
-}
-__forceinline const avxb operator|(const avxb &a, const avxb &b)
-{
-  return _mm256_or_ps(a, b);
-}
-__forceinline const avxb operator^(const avxb &a, const avxb &b)
-{
-  return _mm256_xor_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator&=(avxb &a, const avxb &b)
-{
-  return a = a & b;
-}
-__forceinline const avxb operator|=(avxb &a, const avxb &b)
-{
-  return a = a | b;
-}
-__forceinline const avxb operator^=(avxb &a, const avxb &b)
-{
-  return a = a ^ b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb operator!=(const avxb &a, const avxb &b)
-{
-  return _mm256_xor_ps(a, b);
-}
-__forceinline const avxb operator==(const avxb &a, const avxb &b)
-{
-#ifdef __KERNEL_AVX2__
-  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a, b));
-#else
-  __m128i a_lo = _mm_castps_si128(_mm256_extractf128_ps(a, 0));
-  __m128i a_hi = _mm_castps_si128(_mm256_extractf128_ps(a, 1));
-  __m128i b_lo = _mm_castps_si128(_mm256_extractf128_ps(b, 0));
-  __m128i b_hi = _mm_castps_si128(_mm256_extractf128_ps(b, 1));
-  __m128i c_lo = _mm_cmpeq_epi32(a_lo, b_lo);
-  __m128i c_hi = _mm_cmpeq_epi32(a_hi, b_hi);
-  __m256i result = _mm256_insertf128_si256(_mm256_castsi128_si256(c_lo), c_hi, 1);
-  return _mm256_castsi256_ps(result);
-#endif
-}
-
-__forceinline const avxb select(const avxb &m, const avxb &t, const avxb &f)
-{
-#if defined(__KERNEL_SSE41__)
-  return _mm256_blendv_ps(f, t, m);
-#else
-  return _mm256_or_ps(_mm256_and_ps(m, t), _mm256_andnot_ps(m, f));
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxb unpacklo(const avxb &a, const avxb &b)
-{
-  return _mm256_unpacklo_ps(a, b);
-}
-__forceinline const avxb unpackhi(const avxb &a, const avxb &b)
-{
-  return _mm256_unpackhi_ps(a, b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reduction Operations
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_SSE41__)
-__forceinline uint32_t popcnt(const avxb &a)
-{
-  return _mm_popcnt_u32(_mm256_movemask_ps(a));
-}
-#else
-__forceinline uint32_t popcnt(const avxb &a)
-{
-  return bool(a[0]) + bool(a[1]) + bool(a[2]) + bool(a[3]) + bool(a[4]) + bool(a[5]) + bool(a[6]) +
-         bool(a[7]);
-}
-#endif
-
-__forceinline bool reduce_and(const avxb &a)
-{
-  return _mm256_movemask_ps(a) == 0xf;
-}
-__forceinline bool reduce_or(const avxb &a)
-{
-  return _mm256_movemask_ps(a) != 0x0;
-}
-__forceinline bool all(const avxb &b)
-{
-  return _mm256_movemask_ps(b) == 0xf;
-}
-__forceinline bool any(const avxb &b)
-{
-  return _mm256_movemask_ps(b) != 0x0;
-}
-__forceinline bool none(const avxb &b)
-{
-  return _mm256_movemask_ps(b) == 0x0;
-}
-
-__forceinline uint32_t movemask(const avxb &a)
-{
-  return _mm256_movemask_ps(a);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Debug Functions
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxb(const char *label, const avxb &a)
-{
-  printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
--- a/intern/cycles/util/avxf.h
+++ b/intern/cycles/util/avxf.h
@@ -1,379 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2016 Intel Corporation */
-
-#ifndef __UTIL_AVXF_H__
-#define __UTIL_AVXF_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxf {
-  typedef avxf Float;
-
-  enum { size = 8 }; /* Number of SIMD elements. */
-
-  union {
-    __m256 m256;
-    float f[8];
-    int i[8];
-  };
-
-  __forceinline avxf()
-  {
-  }
-  __forceinline avxf(const avxf &other)
-  {
-    m256 = other.m256;
-  }
-  __forceinline avxf &operator=(const avxf &other)
-  {
-    m256 = other.m256;
-    return *this;
-  }
-
-  __forceinline avxf(const __m256 a) : m256(a)
-  {
-  }
-  __forceinline avxf(const __m256i a) : m256(_mm256_castsi256_ps(a))
-  {
-  }
-
-  __forceinline operator const __m256 &() const
-  {
-    return m256;
-  }
-  __forceinline operator __m256 &()
-  {
-    return m256;
-  }
-
-  __forceinline avxf(float a) : m256(_mm256_set1_ps(a))
-  {
-  }
-
-  __forceinline avxf(float high32x4, float low32x4)
-      : m256(_mm256_set_ps(
-            high32x4, high32x4, high32x4, high32x4, low32x4, low32x4, low32x4, low32x4))
-  {
-  }
-
-  __forceinline avxf(float a3, float a2, float a1, float a0)
-      : m256(_mm256_set_ps(a3, a2, a1, a0, a3, a2, a1, a0))
-  {
-  }
-
-  __forceinline avxf(
-      float a7, float a6, float a5, float a4, float a3, float a2, float a1, float a0)
-      : m256(_mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0))
-  {
-  }
-
-  __forceinline avxf(float3 a) : m256(_mm256_set_ps(a.w, a.z, a.y, a.x, a.w, a.z, a.y, a.x))
-  {
-  }
-
-  __forceinline avxf(int a3, int a2, int a1, int a0)
-  {
-    const __m256i foo = _mm256_set_epi32(a3, a2, a1, a0, a3, a2, a1, a0);
-    m256 = _mm256_castsi256_ps(foo);
-  }
-
-  __forceinline avxf(int a7, int a6, int a5, int a4, int a3, int a2, int a1, int a0)
-  {
-    const __m256i foo = _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
-    m256 = _mm256_castsi256_ps(foo);
-  }
-
-  __forceinline avxf(__m128 a, __m128 b)
-  {
-    const __m256 foo = _mm256_castps128_ps256(a);
-    m256 = _mm256_insertf128_ps(foo, b, 1);
-  }
-
-  __forceinline const float &operator[](const size_t i) const
-  {
-    assert(i < 8);
-    return f[i];
-  }
-  __forceinline float &operator[](const size_t i)
-  {
-    assert(i < 8);
-    return f[i];
-  }
-};
-
-__forceinline avxf cross(const avxf &a, const avxf &b)
-{
-  avxf r(0.0,
-         a[4] * b[5] - a[5] * b[4],
-         a[6] * b[4] - a[4] * b[6],
-         a[5] * b[6] - a[6] * b[5],
-         0.0,
-         a[0] * b[1] - a[1] * b[0],
-         a[2] * b[0] - a[0] * b[2],
-         a[1] * b[2] - a[2] * b[1]);
-  return r;
-}
-
-__forceinline void dot3(const avxf &a, const avxf &b, float &den, float &den2)
-{
-  const avxf t = _mm256_mul_ps(a.m256, b.m256);
-  den = ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
-  den2 = ((float *)&t)[4] + ((float *)&t)[5] + ((float *)&t)[6];
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf cast(const __m256i &a)
-{
-  return _mm256_castsi256_ps(a);
-}
-
-__forceinline const avxf mm256_sqrt(const avxf &a)
-{
-  return _mm256_sqrt_ps(a.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf operator+(const avxf &a, const avxf &b)
-{
-  return _mm256_add_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator+(const avxf &a, const float &b)
-{
-  return a + avxf(b);
-}
-__forceinline const avxf operator+(const float &a, const avxf &b)
-{
-  return avxf(a) + b;
-}
-
-__forceinline const avxf operator-(const avxf &a, const avxf &b)
-{
-  return _mm256_sub_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator-(const avxf &a, const float &b)
-{
-  return a - avxf(b);
-}
-__forceinline const avxf operator-(const float &a, const avxf &b)
-{
-  return avxf(a) - b;
-}
-
-__forceinline const avxf operator*(const avxf &a, const avxf &b)
-{
-  return _mm256_mul_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator*(const avxf &a, const float &b)
-{
-  return a * avxf(b);
-}
-__forceinline const avxf operator*(const float &a, const avxf &b)
-{
-  return avxf(a) * b;
-}
-
-__forceinline const avxf operator/(const avxf &a, const avxf &b)
-{
-  return _mm256_div_ps(a.m256, b.m256);
-}
-__forceinline const avxf operator/(const avxf &a, const float &b)
-{
-  return a / avxf(b);
-}
-__forceinline const avxf operator/(const float &a, const avxf &b)
-{
-  return avxf(a) / b;
-}
-
-__forceinline const avxf operator|(const avxf &a, const avxf &b)
-{
-  return _mm256_or_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator^(const avxf &a, const avxf &b)
-{
-  return _mm256_xor_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf operator&(const avxf &a, const avxf &b)
-{
-  return _mm256_and_ps(a.m256, b.m256);
-}
-
-__forceinline const avxf max(const avxf &a, const avxf &b)
-{
-  return _mm256_max_ps(a.m256, b.m256);
-}
-__forceinline const avxf min(const avxf &a, const avxf &b)
-{
-  return _mm256_min_ps(a.m256, b.m256);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxf shuffle(const avxf &a, const __m256i &shuf)
-{
-  return _mm256_permutevar_ps(a, shuf);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf shuffle(const avxf &a)
-{
-  return _mm256_permutevar_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
-  return _mm256_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
-}
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxf shuffle(const avxf &a)
-{
-  return shuffle<i0, i1, i2, i3>(a, a);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a, const avxf &b)
-{
-  return shuffle<i0, i0, i0, i0>(a, b);
-}
-template<size_t i0> __forceinline const avxf shuffle(const avxf &a)
-{
-  return shuffle<i0>(a, a);
-}
-
-template<size_t i> __forceinline float extract(const avxf &a)
-{
-  __m256 b = shuffle<i, i, i, i>(a).m256;
-  return _mm256_cvtss_f32(b);
-}
-template<> __forceinline float extract<0>(const avxf &a)
-{
-  return _mm256_cvtss_f32(a.m256);
-}
-
-__forceinline ssef low(const avxf &a)
-{
-  return _mm256_extractf128_ps(a.m256, 0);
-}
-__forceinline ssef high(const avxf &a)
-{
-  return _mm256_extractf128_ps(a.m256, 1);
-}
-
-template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-__forceinline const avxf permute(const avxf &a)
-{
-#ifdef __KERNEL_AVX2__
-  return _mm256_permutevar8x32_ps(a, _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0));
-#else
-  float temp[8];
-  _mm256_storeu_ps((float *)&temp, a);
-  return avxf(temp[i7], temp[i6], temp[i5], temp[i4], temp[i3], temp[i2], temp[i1], temp[i0]);
-#endif
-}
-
-template<int S0, int S1, int S2, int S3, int S4, int S5, int S6, int S7>
-ccl_device_inline const avxf set_sign_bit(const avxf &a)
-{
-  return a ^ avxf(S7 << 31, S6 << 31, S5 << 31, S4 << 31, S3 << 31, S2 << 31, S1 << 31, S0 << 31);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3, size_t S4, size_t S5, size_t S6, size_t S7>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
-  return _mm256_blend_ps(
-      a, b, S7 << 0 | S6 << 1 | S5 << 2 | S4 << 3 | S3 << 4 | S2 << 5 | S1 << 6 | S0 << 7);
-}
-
-template<size_t S0, size_t S1, size_t S2, size_t S3>
-ccl_device_inline const avxf blend(const avxf &a, const avxf &b)
-{
-  return blend<S0, S1, S2, S3, S0, S1, S2, S3>(a, b);
-}
-
-//#if defined(__KERNEL_SSE41__)
-__forceinline avxf maxi(const avxf &a, const avxf &b)
-{
-  const avxf ci = _mm256_max_ps(a, b);
-  return ci;
-}
-
-__forceinline avxf mini(const avxf &a, const avxf &b)
-{
-  const avxf ci = _mm256_min_ps(a, b);
-  return ci;
-}
-//#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Ternary Operators
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxf madd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
-  return _mm256_fmadd_ps(a, b, c);
-#else
-  return c + (a * b);
-#endif
-}
-
-__forceinline const avxf nmadd(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
-  return _mm256_fnmadd_ps(a, b, c);
-#else
-  return c - (a * b);
-#endif
-}
-__forceinline const avxf msub(const avxf &a, const avxf &b, const avxf &c)
-{
-#ifdef __KERNEL_AVX2__
-  return _mm256_fmsub_ps(a, b, c);
-#else
-  return (a * b) - c;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-__forceinline const avxb operator<=(const avxf &a, const avxf &b)
-{
-  return _mm256_cmp_ps(a.m256, b.m256, _CMP_LE_OS);
-}
-
-__forceinline const avxf select(const avxb &m, const avxf &t, const avxf &f)
-{
-  return _mm256_blendv_ps(f, t, m);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Common Functions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxf mix(const avxf &a, const avxf &b, const avxf &t)
-{
-  return madd(t, b, (avxf(1.0f) - t) * a);
-}
-
-#ifndef _mm256_set_m128
-#  define _mm256_set_m128(/* __m128 */ hi, /* __m128 */ lo) \
-    _mm256_insertf128_ps(_mm256_castps128_ps256(lo), (hi), 0x1)
-#endif
-
-#define _mm256_loadu2_m128(/* float const* */ hiaddr, /* float const* */ loaddr) \
-  _mm256_set_m128(_mm_loadu_ps(hiaddr), _mm_loadu_ps(loaddr))
-
-CCL_NAMESPACE_END
-
-#endif
--- a/intern/cycles/util/avxi.h
+++ b/intern/cycles/util/avxi.h
@@ -1,732 +0,0 @@
-/* SPDX-License-Identifier: Apache-2.0
- * Copyright 2009-2013 Intel Corporation */
-
-#ifndef __UTIL_AVXI_H__
-#define __UTIL_AVXI_H__
-
-CCL_NAMESPACE_BEGIN
-
-struct avxb;
-
-struct avxi {
-  typedef avxb Mask;  // mask type for us
-  enum { size = 8 };  // number of SIMD elements
-  union {             // data
-    __m256i m256;
-#if !defined(__KERNEL_AVX2__)
-    struct {
-      __m128i l, h;
-    };
-#endif
-    int32_t v[8];
-  };
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Constructors, Assignment & Cast Operators
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline avxi()
-  {
-  }
-  __forceinline avxi(const avxi &a)
-  {
-    m256 = a.m256;
-  }
-  __forceinline avxi &operator=(const avxi &a)
-  {
-    m256 = a.m256;
-    return *this;
-  }
-
-  __forceinline avxi(const __m256i a) : m256(a)
-  {
-  }
-  __forceinline operator const __m256i &(void) const
-  {
-    return m256;
-  }
-  __forceinline operator __m256i &(void)
-  {
-    return m256;
-  }
-
-  __forceinline explicit avxi(const ssei &a)
-      : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), a, 1))
-  {
-  }
-  __forceinline avxi(const ssei &a, const ssei &b)
-      : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
-  {
-  }
-#if defined(__KERNEL_AVX2__)
-  __forceinline avxi(const __m128i &a, const __m128i &b)
-      : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1))
-  {
-  }
-#else
-  __forceinline avxi(const __m128i &a, const __m128i &b) : l(a), h(b)
-  {
-  }
-#endif
-  __forceinline explicit avxi(const int32_t *const a)
-      : m256(_mm256_castps_si256(_mm256_loadu_ps((const float *)a)))
-  {
-  }
-  __forceinline avxi(int32_t a) : m256(_mm256_set1_epi32(a))
-  {
-  }
-  __forceinline avxi(int32_t a, int32_t b) : m256(_mm256_set_epi32(b, a, b, a, b, a, b, a))
-  {
-  }
-  __forceinline avxi(int32_t a, int32_t b, int32_t c, int32_t d)
-      : m256(_mm256_set_epi32(d, c, b, a, d, c, b, a))
-  {
-  }
-  __forceinline avxi(
-      int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, int32_t f, int32_t g, int32_t h)
-      : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a))
-  {
-  }
-
-  __forceinline explicit avxi(const __m256 a) : m256(_mm256_cvtps_epi32(a))
-  {
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Constants
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline avxi(ZeroTy) : m256(_mm256_setzero_si256())
-  {
-  }
-#if defined(__KERNEL_AVX2__)
-  __forceinline avxi(OneTy) : m256(_mm256_set1_epi32(1))
-  {
-  }
-  __forceinline avxi(PosInfTy) : m256(_mm256_set1_epi32(pos_inf))
-  {
-  }
-  __forceinline avxi(NegInfTy) : m256(_mm256_set1_epi32(neg_inf))
-  {
-  }
-#else
-  __forceinline avxi(OneTy) : m256(_mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1))
-  {
-  }
-  __forceinline avxi(PosInfTy)
-      : m256(_mm256_set_epi32(
-            pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf, pos_inf))
-  {
-  }
-  __forceinline avxi(NegInfTy)
-      : m256(_mm256_set_epi32(
-            neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf, neg_inf))
-  {
-  }
-#endif
-  __forceinline avxi(StepTy) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))
-  {
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////
-  /// Array Access
-  ////////////////////////////////////////////////////////////////////////////////
-
-  __forceinline const int32_t &operator[](const size_t i) const
-  {
-    assert(i < 8);
-    return v[i];
-  }
-  __forceinline int32_t &operator[](const size_t i)
-  {
-    assert(i < 8);
-    return v[i];
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////
-/// Unary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi cast(const __m256 &a)
-{
-  return _mm256_castps_si256(a);
-}
-__forceinline const avxi operator+(const avxi &a)
-{
-  return a;
-}
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a)
-{
-  return _mm256_sub_epi32(_mm256_setzero_si256(), a.m256);
-}
-__forceinline const avxi abs(const avxi &a)
-{
-  return _mm256_abs_epi32(a.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a)
-{
-  return avxi(_mm_sub_epi32(_mm_setzero_si128(), a.l), _mm_sub_epi32(_mm_setzero_si128(), a.h));
-}
-__forceinline const avxi abs(const avxi &a)
-{
-  return avxi(_mm_abs_epi32(a.l), _mm_abs_epi32(a.h));
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-/// Binary Operators
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
-  return _mm256_add_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator+(const avxi &a, const avxi &b)
-{
-  return avxi(_mm_add_epi32(a.l, b.l), _mm_add_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator+(const avxi &a, const int32_t b)
-{
-  return a + avxi(b);
-}
-__forceinline const avxi operator+(const int32_t a, const avxi &b)
-{
-  return avxi(a) + b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
-  return _mm256_sub_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator-(const avxi &a, const avxi &b)
-{
-  return avxi(_mm_sub_epi32(a.l, b.l), _mm_sub_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator-(const avxi &a, const int32_t b)
-{
-  return a - avxi(b);
-}
-__forceinline const avxi operator-(const int32_t a, const avxi &b)
-{
-  return avxi(a) - b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
-  return _mm256_mullo_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator*(const avxi &a, const avxi &b)
-{
-  return avxi(_mm_mullo_epi32(a.l, b.l), _mm_mullo_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi operator*(const avxi &a, const int32_t b)
-{
-  return a * avxi(b);
-}
-__forceinline const avxi operator*(const int32_t a, const avxi &b)
-{
-  return avxi(a) * b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
-  return _mm256_and_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator&(const avxi &a, const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator&(const avxi &a, const int32_t b)
-{
-  return a & avxi(b);
-}
-__forceinline const avxi operator&(const int32_t a, const avxi &b)
-{
-  return avxi(a) & b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
-  return _mm256_or_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator|(const avxi &a, const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator|(const avxi &a, const int32_t b)
-{
-  return a | avxi(b);
-}
-__forceinline const avxi operator|(const int32_t a, const avxi &b)
-{
-  return avxi(a) | b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
-  return _mm256_xor_si256(a.m256, b.m256);
-}
-#else
-__forceinline const avxi operator^(const avxi &a, const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-__forceinline const avxi operator^(const avxi &a, const int32_t b)
-{
-  return a ^ avxi(b);
-}
-__forceinline const avxi operator^(const int32_t a, const avxi &b)
-{
-  return avxi(a) ^ b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
-  return _mm256_slli_epi32(a.m256, n);
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
-  return _mm256_srai_epi32(a.m256, n);
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
-  return _mm256_srai_epi32(a.m256, b);
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
-  return _mm256_srli_epi32(a.m256, b);
-}
-#else
-__forceinline const avxi operator<<(const avxi &a, const int32_t n)
-{
-  return avxi(_mm_slli_epi32(a.l, n), _mm_slli_epi32(a.h, n));
-}
-__forceinline const avxi operator>>(const avxi &a, const int32_t n)
-{
-  return avxi(_mm_srai_epi32(a.l, n), _mm_srai_epi32(a.h, n));
-}
-
-__forceinline const avxi sra(const avxi &a, const int32_t b)
-{
-  return avxi(_mm_srai_epi32(a.l, b), _mm_srai_epi32(a.h, b));
-}
-__forceinline const avxi srl(const avxi &a, const int32_t b)
-{
-  return avxi(_mm_srli_epi32(a.l, b), _mm_srli_epi32(a.h, b));
-}
-#endif
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
-  return _mm256_min_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi min(const avxi &a, const avxi &b)
-{
-  return avxi(_mm_min_epi32(a.l, b.l), _mm_min_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi min(const avxi &a, const int32_t b)
-{
-  return min(a, avxi(b));
-}
-__forceinline const avxi min(const int32_t a, const avxi &b)
-{
-  return min(avxi(a), b);
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
-  return _mm256_max_epi32(a.m256, b.m256);
-}
-#else
-__forceinline const avxi max(const avxi &a, const avxi &b)
-{
-  return avxi(_mm_max_epi32(a.l, b.l), _mm_max_epi32(a.h, b.h));
-}
-#endif
-__forceinline const avxi max(const avxi &a, const int32_t b)
-{
-  return max(a, avxi(b));
-}
-__forceinline const avxi max(const int32_t a, const avxi &b)
-{
-  return max(avxi(a), b);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Assignment Operators
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline avxi &operator+=(avxi &a, const avxi &b)
-{
-  return a = a + b;
-}
-__forceinline avxi &operator+=(avxi &a, const int32_t b)
-{
-  return a = a + b;
-}
-
-__forceinline avxi &operator-=(avxi &a, const avxi &b)
-{
-  return a = a - b;
-}
-__forceinline avxi &operator-=(avxi &a, const int32_t b)
-{
-  return a = a - b;
-}
-
-__forceinline avxi &operator*=(avxi &a, const avxi &b)
-{
-  return a = a * b;
-}
-__forceinline avxi &operator*=(avxi &a, const int32_t b)
-{
-  return a = a * b;
-}
-
-__forceinline avxi &operator&=(avxi &a, const avxi &b)
-{
-  return a = a & b;
-}
-__forceinline avxi &operator&=(avxi &a, const int32_t b)
-{
-  return a = a & b;
-}
-
-__forceinline avxi &operator|=(avxi &a, const avxi &b)
-{
-  return a = a | b;
-}
-__forceinline avxi &operator|=(avxi &a, const int32_t b)
-{
-  return a = a | b;
-}
-
-__forceinline avxi &operator^=(avxi &a, const avxi &b)
-{
-  return a = a ^ b;
-}
-__forceinline avxi &operator^=(avxi &a, const int32_t b)
-{
-  return a = a ^ b;
-}
-
-__forceinline avxi &operator<<=(avxi &a, const int32_t b)
-{
-  return a = a << b;
-}
-__forceinline avxi &operator>>=(avxi &a, const int32_t b)
-{
-  return a = a >> b;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Comparison Operators + Select
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
-  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator==(const avxi &a, const avxi &b)
-{
-  return avxb(_mm_castsi128_ps(_mm_cmpeq_epi32(a.l, b.l)),
-              _mm_castsi128_ps(_mm_cmpeq_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator==(const avxi &a, const int32_t b)
-{
-  return a == avxi(b);
-}
-__forceinline const avxb operator==(const int32_t a, const avxi &b)
-{
-  return avxi(a) == b;
-}
-
-__forceinline const avxb operator!=(const avxi &a, const avxi &b)
-{
-  return !(a == b);
-}
-__forceinline const avxb operator!=(const avxi &a, const int32_t b)
-{
-  return a != avxi(b);
-}
-__forceinline const avxb operator!=(const int32_t a, const avxi &b)
-{
-  return avxi(a) != b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
-  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(b.m256, a.m256));
-}
-#else
-__forceinline const avxb operator<(const avxi &a, const avxi &b)
-{
-  return avxb(_mm_castsi128_ps(_mm_cmplt_epi32(a.l, b.l)),
-              _mm_castsi128_ps(_mm_cmplt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator<(const avxi &a, const int32_t b)
-{
-  return a < avxi(b);
-}
-__forceinline const avxb operator<(const int32_t a, const avxi &b)
-{
-  return avxi(a) < b;
-}
-
-__forceinline const avxb operator>=(const avxi &a, const avxi &b)
-{
-  return !(a < b);
-}
-__forceinline const avxb operator>=(const avxi &a, const int32_t b)
-{
-  return a >= avxi(b);
-}
-__forceinline const avxb operator>=(const int32_t a, const avxi &b)
-{
-  return avxi(a) >= b;
-}
-
-#if defined(__KERNEL_AVX2__)
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
-  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(a.m256, b.m256));
-}
-#else
-__forceinline const avxb operator>(const avxi &a, const avxi &b)
-{
-  return avxb(_mm_castsi128_ps(_mm_cmpgt_epi32(a.l, b.l)),
-              _mm_castsi128_ps(_mm_cmpgt_epi32(a.h, b.h)));
-}
-#endif
-__forceinline const avxb operator>(const avxi &a, const int32_t b)
-{
-  return a > avxi(b);
-}
-__forceinline const avxb operator>(const int32_t a, const avxi &b)
-{
-  return avxi(a) > b;
-}
-
-__forceinline const avxb operator<=(const avxi &a, const avxi &b)
-{
-  return !(a > b);
-}
-__forceinline const avxb operator<=(const avxi &a, const int32_t b)
-{
-  return a <= avxi(b);
-}
-__forceinline const avxb operator<=(const int32_t a, const avxi &b)
-{
-  return avxi(a) <= b;
-}
-
-__forceinline const avxi select(const avxb &m, const avxi &t, const avxi &f)
-{
-  return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), m));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Movement/Shifting/Shuffling Functions
-////////////////////////////////////////////////////////////////////////////////
-
-#if defined(__KERNEL_AVX2__)
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
-  return _mm256_unpacklo_epi32(a.m256, b.m256);
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
-  return _mm256_unpackhi_epi32(a.m256, b.m256);
-}
-#else
-__forceinline avxi unpacklo(const avxi &a, const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-__forceinline avxi unpackhi(const avxi &a, const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-}
-#endif
-
-template<size_t i> __forceinline const avxi shuffle(const avxi &a)
-{
-  return _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i, i, i, i)));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a)
-{
-  return _mm256_permute2f128_si256(a, a, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1> __forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
-  return _mm256_permute2f128_si256(a, b, (i1 << 4) | (i0 << 0));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a)
-{
-  return _mm256_castps_si256(
-      _mm256_permute_ps(_mm256_castsi256_ps(a), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<size_t i0, size_t i1, size_t i2, size_t i3>
-__forceinline const avxi shuffle(const avxi &a, const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_shuffle_ps(
-      _mm256_castsi256_ps(a), _mm256_castsi256_ps(b), _MM_SHUFFLE(i3, i2, i1, i0)));
-}
-
-template<> __forceinline const avxi shuffle<0, 0, 2, 2>(const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_moveldup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<1, 1, 3, 3>(const avxi &b)
-{
-  return _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(b)));
-}
-template<> __forceinline const avxi shuffle<0, 1, 0, 1>(const avxi &b)
-{
-  return _mm256_castps_si256(
-      _mm256_castpd_ps(_mm256_movedup_pd(_mm256_castps_pd(_mm256_castsi256_ps(b)))));
-}
-
-__forceinline const avxi broadcast(const int *ptr)
-{
-  return _mm256_castps_si256(_mm256_broadcast_ss((const float *)ptr));
-}
-template<size_t i> __forceinline const avxi insert(const avxi &a, const ssei &b)
-{
-  return _mm256_insertf128_si256(a, b, i);
-}
-template<size_t i> __forceinline const ssei extract(const avxi &a)
-{
-  return _mm256_extractf128_si256(a, i);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Reductions
-////////////////////////////////////////////////////////////////////////////////
-
-__forceinline const avxi vreduce_min2(const avxi &v)
-{
-  return min(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_min4(const avxi &v)
-{
-  avxi v1 = vreduce_min2(v);
-  return min(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_min(const avxi &v)
-{
-  avxi v1 = vreduce_min4(v);
-  return min(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_max2(const avxi &v)
-{
-  return max(v, shuffle<1, 0, 3, 2>(v));
-}
-__forceinline const avxi vreduce_max4(const avxi &v)
-{
-  avxi v1 = vreduce_max2(v);
-  return max(v1, shuffle<2, 3, 0, 1>(v1));
-}
-__forceinline const avxi vreduce_max(const avxi &v)
-{
-  avxi v1 = vreduce_max4(v);
-  return max(v1, shuffle<1, 0>(v1));
-}
-
-__forceinline const avxi vreduce_add2(const avxi &v)
-{
-  return v + shuffle<1, 0, 3, 2>(v);
-}
-__forceinline const avxi vreduce_add4(const avxi &v)
-{
-  avxi v1 = vreduce_add2(v);
-  return v1 + shuffle<2, 3, 0, 1>(v1);
-}
-__forceinline const avxi vreduce_add(const avxi &v)
-{
-  avxi v1 = vreduce_add4(v);
-  return v1 + shuffle<1, 0>(v1);
-}
-
-__forceinline int reduce_min(const avxi &v)
-{
-  return extract<0>(extract<0>(vreduce_min(v)));
-}
-__forceinline int reduce_max(const avxi &v)
-{
-  return extract<0>(extract<0>(vreduce_max(v)));
-}
-__forceinline int reduce_add(const avxi &v)
-{
-  return extract<0>(extract<0>(vreduce_add(v)));
-}
-
-__forceinline uint32_t select_min(const avxi &v)
-{
-  return __bsf(movemask(v == vreduce_min(v)));
-}
-__forceinline uint32_t select_max(const avxi &v)
-{
-  return __bsf(movemask(v == vreduce_max(v)));
-}
-
-__forceinline uint32_t select_min(const avxb &valid, const avxi &v)
-{
-  const avxi a = select(valid, v, avxi(pos_inf));
-  return __bsf(movemask(valid & (a == vreduce_min(a))));
-}
-__forceinline uint32_t select_max(const avxb &valid, const avxi &v)
-{
-  const avxi a = select(valid, v, avxi(neg_inf));
-  return __bsf(movemask(valid & (a == vreduce_max(a))));
-}
-
-////////////////////////////////////////////////////////////////////////////////
-/// Output Operators
-////////////////////////////////////////////////////////////////////////////////
-
-ccl_device_inline void print_avxi(const char *label, const avxi &a)
-{
-  printf("%s: %d %d %d %d %d %d %d %d\n", label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
-}
-
-CCL_NAMESPACE_END
-
-#endif
--- a/intern/cycles/util/color.h
+++ b/intern/cycles/util/color.h
@@ -228,28 +228,27 @@ ccl_device float3 xyY_to_xyz(float x, float y, float Y)
 * exp = exponent, encoded as uint32_t
 * e2coeff = 2^(127/exponent - 127) * bias_coeff^(1/exponent), encoded as uint32_t
 */
-template<unsigned exp, unsigned e2coeff> ccl_device_inline ssef fastpow(const ssef &arg)
+template<unsigned exp, unsigned e2coeff> ccl_device_inline float4 fastpow(const float4 &arg)
 {
-  ssef ret;
-  ret = arg * cast(ssei(e2coeff));
-  ret = ssef(cast(ret));
-  ret = ret * cast(ssei(exp));
-  ret = cast(ssei(ret));
+  float4 ret = arg * cast(make_int4(e2coeff));
+  ret = make_float4(cast(ret));
+  ret = ret * cast(make_int4(exp));
+  ret = cast(make_int4(ret));
  return ret;
 }

 /* Improve x ^ 1.0f/5.0f solution with Newton-Raphson method */
-ccl_device_inline ssef improve_5throot_solution(const ssef &old_result, const ssef &x)
+ccl_device_inline float4 improve_5throot_solution(const float4 &old_result, const float4 &x)
 {
-  ssef approx2 = old_result * old_result;
-  ssef approx4 = approx2 * approx2;
-  ssef t = x / approx4;
-  ssef summ = madd(ssef(4.0f), old_result, t);
-  return summ * ssef(1.0f / 5.0f);
+  float4 approx2 = old_result * old_result;
+  float4 approx4 = approx2 * approx2;
+  float4 t = x / approx4;
+  float4 summ = madd(make_float4(4.0f), old_result, t);
+  return summ * make_float4(1.0f / 5.0f);
 }

 /* Calculate powf(x, 2.4). Working domain: 1e-10 < x < 1e+10 */
-ccl_device_inline ssef fastpow24(const ssef &arg)
+ccl_device_inline float4 fastpow24(const float4 &arg)
 {
  /* max, avg and |avg| errors were calculated in gcc without FMA instructions
   * The final precision should be better than powf in glibc */
@@ -257,9 +256,10 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
  /* Calculate x^4/5, coefficient 0.994 was constructed manually to minimize avg error */
  /* 0x3F4CCCCD = 4/5 */
  /* 0x4F55A7FB = 2^(127/(4/5) - 127) * 0.994^(1/(4/5)) */
-  ssef x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(arg);  // error max = 0.17  avg = 0.0018    |avg| = 0.05
-  ssef arg2 = arg * arg;
-  ssef arg4 = arg2 * arg2;
+  float4 x = fastpow<0x3F4CCCCD, 0x4F55A7FB>(
+      arg);  // error max = 0.17  avg = 0.0018    |avg| = 0.05
+  float4 arg2 = arg * arg;
+  float4 arg4 = arg2 * arg2;

  /* error max = 0.018     avg = 0.0031    |avg| = 0.0031 */
  x = improve_5throot_solution(x, arg4);
@@ -271,12 +271,12 @@ ccl_device_inline ssef fastpow24(const ssef &arg)
  return x * (x * x);
 }

-ccl_device ssef color_srgb_to_linear(const ssef &c)
+ccl_device float4 color_srgb_to_linear(const float4 &c)
 {
-  sseb cmp = c < ssef(0.04045f);
-  ssef lt = max(c * ssef(1.0f / 12.92f), ssef(0.0f));
-  ssef gtebase = (c + ssef(0.055f)) * ssef(1.0f / 1.055f); /* fma */
-  ssef gte = fastpow24(gtebase);
+  int4 cmp = c < make_float4(0.04045f);
+  float4 lt = max(c * make_float4(1.0f / 12.92f), make_float4(0.0f));
+  float4 gtebase = (c + make_float4(0.055f)) * make_float4(1.0f / 1.055f); /* fma */
+  float4 gte = fastpow24(gtebase);
  return select(cmp, lt, gte);
 }
 #endif /* __KERNEL_SSE2__ */
@@ -302,10 +302,8 @@ ccl_device float4 color_linear_to_srgb_v4(float4 c)
 ccl_device float4 color_srgb_to_linear_v4(float4 c)
 {
 #ifdef __KERNEL_SSE2__
-  ssef r_ssef;
-  float4 &r = (float4 &)r_ssef;
-  r = c;
-  r_ssef = color_srgb_to_linear(r_ssef);
+  float4 r = c;
+  r = color_srgb_to_linear(r);
  r.w = c.w;
  return r;
 #else
--- a/intern/cycles/util/defines.h
+++ b/intern/cycles/util/defines.h
@@ -23,6 +23,7 @@
 /* Leave inlining decisions to compiler for these, the inline keyword here
 * is not about performance but including function definitions in headers. */
 #  define ccl_device static inline
+#  define ccl_device_extern extern "C"
 #  define ccl_device_noinline static inline
 #  define ccl_device_noinline_cpu ccl_device_noinline

--- a/intern/cycles/util/half.h
+++ b/intern/cycles/util/half.h
@@ -154,17 +154,17 @@ ccl_device_inline half float_to_half_display(const float f)

 ccl_device_inline half4 float4_to_half4_display(const float4 f)
 {
-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
  /* CPU: SSE and AVX. */
-  ssef x = min(max(load4f(f), 0.0f), 65504.0f);
+  float4 x = min(max(f, make_float4(0.0f)), make_float4(65504.0f));
 #  ifdef __KERNEL_AVX2__
-  ssei rpack = _mm_cvtps_ph(x, 0);
+  int4 rpack = int4(_mm_cvtps_ph(x, 0));
 #  else
-  ssei absolute = cast(x) & 0x7FFFFFFF;
-  ssei Z = absolute + 0xC8000000;
-  ssei result = andnot(absolute < 0x38800000, Z);
-  ssei rshift = (result >> 13) & 0x7FFF;
-  ssei rpack = _mm_packs_epi32(rshift, rshift);
+  int4 absolute = cast(x) & make_int4(0x7FFFFFFF);
+  int4 Z = absolute + make_int4(0xC8000000);
+  int4 result = andnot(absolute < make_int4(0x38800000), Z);
+  int4 rshift = (result >> 13) & make_int4(0x7FFF);
+  int4 rpack = int4(_mm_packs_epi32(rshift, rshift));
 #  endif
  half4 h;
  _mm_storel_pi((__m64 *)&h, _mm_castsi128_ps(rpack));
--- a/intern/cycles/util/hash.h
+++ b/intern/cycles/util/hash.h
@@ -222,7 +222,7 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)

 /* SSE Versions Of Jenkins Lookup3 Hash Functions */

-#ifdef __KERNEL_SSE2__
+#ifdef __KERNEL_SSE__
 #  define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))

 #  define mix(a, b, c) \
@@ -265,10 +265,10 @@ ccl_device_inline float3 hash_float4_to_float3(float4 k)
      c -= rot(b, 24); \
    }

-ccl_device_inline ssei hash_ssei(ssei kx)
+ccl_device_inline int4 hash_int4(int4 kx)
 {
-  ssei a, b, c;
-  a = b = c = ssei(0xdeadbeef + (1 << 2) + 13);
+  int4 a, b, c;
+  a = b = c = make_int4(0xdeadbeef + (1 << 2) + 13);

  a += kx;
  final(a, b, c);
@@ -276,10 +276,10 @@ ccl_device_inline ssei hash_ssei(ssei kx)
  return c;
 }

-ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
+ccl_device_inline int4 hash_int4_2(int4 kx, int4 ky)
 {
-  ssei a, b, c;
-  a = b = c = ssei(0xdeadbeef + (2 << 2) + 13);
+  int4 a, b, c;
+  a = b = c = make_int4(0xdeadbeef + (2 << 2) + 13);

  b += ky;
  a += kx;
@@ -288,10 +288,10 @@ ccl_device_inline ssei hash_ssei2(ssei kx, ssei ky)
  return c;
 }

-ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
+ccl_device_inline int4 hash_int4_3(int4 kx, int4 ky, int4 kz)
 {
-  ssei a, b, c;
-  a = b = c = ssei(0xdeadbeef + (3 << 2) + 13);
+  int4 a, b, c;
+  a = b = c = make_int4(0xdeadbeef + (3 << 2) + 13);

  c += kz;
  b += ky;
@@ -301,10 +301,10 @@ ccl_device_inline ssei hash_ssei3(ssei kx, ssei ky, ssei kz)
  return c;
 }

-ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
+ccl_device_inline int4 hash_int4_4(int4 kx, int4 ky, int4 kz, int4 kw)
 {
-  ssei a, b, c;
-  a = b = c = ssei(0xdeadbeef + (4 << 2) + 13);
+  int4 a, b, c;
+  a = b = c = make_int4(0xdeadbeef + (4 << 2) + 13);

  a += kx;
  b += ky;
@@ -317,11 +317,11 @@ ccl_device_inline ssei hash_ssei4(ssei kx, ssei ky, ssei kz, ssei kw)
  return c;
 }

-#  if defined(__KERNEL_AVX__)
-ccl_device_inline avxi hash_avxi(avxi kx)
+#  if defined(__KERNEL_AVX2__)
+ccl_device_inline vint8 hash_int8(vint8 kx)
 {
-  avxi a, b, c;
-  a = b = c = avxi(0xdeadbeef + (1 << 2) + 13);
+  vint8 a, b, c;
+  a = b = c = make_vint8(0xdeadbeef + (1 << 2) + 13);

  a += kx;
  final(a, b, c);
@@ -329,10 +329,10 @@ ccl_device_inline avxi hash_avxi(avxi kx)
  return c;
 }

-ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
+ccl_device_inline vint8 hash_int8_2(vint8 kx, vint8 ky)
 {
-  avxi a, b, c;
-  a = b = c = avxi(0xdeadbeef + (2 << 2) + 13);
+  vint8 a, b, c;
+  a = b = c = make_vint8(0xdeadbeef + (2 << 2) + 13);

  b += ky;
  a += kx;
@@ -341,10 +341,10 @@ ccl_device_inline avxi hash_avxi2(avxi kx, avxi ky)
  return c;
 }

-ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
+ccl_device_inline vint8 hash_int8_3(vint8 kx, vint8 ky, vint8 kz)
 {
-  avxi a, b, c;
-  a = b = c = avxi(0xdeadbeef + (3 << 2) + 13);
+  vint8 a, b, c;
+  a = b = c = make_vint8(0xdeadbeef + (3 << 2) + 13);

  c += kz;
  b += ky;
@@ -354,10 +354,10 @@ ccl_device_inline avxi hash_avxi3(avxi kx, avxi ky, avxi kz)
  return c;
 }

-ccl_device_inline avxi hash_avxi4(avxi kx, avxi ky, avxi kz, avxi kw)
+ccl_device_inline vint8 hash_int8_4(vint8 kx, vint8 ky, vint8 kz, vint8 kw)
 {
-  avxi a, b, c;
-  a = b = c = avxi(0xdeadbeef + (4 << 2) + 13);
+  vint8 a, b, c;
+  a = b = c = make_vint8(0xdeadbeef + (4 << 2) + 13);

  a += kx;
  b += ky;
--- a/intern/cycles/util/math.h
+++ b/intern/cycles/util/math.h
@@ -532,12 +532,14 @@ CCL_NAMESPACE_END
 #include "util/math_int2.h"
 #include "util/math_int3.h"
 #include "util/math_int4.h"
+#include "util/math_int8.h"

 #include "util/math_float2.h"
-#include "util/math_float3.h"
 #include "util/math_float4.h"
 #include "util/math_float8.h"

+#include "util/math_float3.h"
+
 #include "util/rect.h"

 CCL_NAMESPACE_BEGIN
--- a/intern/cycles/util/math_float2.h
+++ b/intern/cycles/util/math_float2.h
@@ -10,55 +10,6 @@

 CCL_NAMESPACE_BEGIN

-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float2 operator-(const float2 &a);
-ccl_device_inline float2 operator*(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator*(const float2 &a, float f);
-ccl_device_inline float2 operator*(float f, const float2 &a);
-ccl_device_inline float2 operator/(float f, const float2 &a);
-ccl_device_inline float2 operator/(const float2 &a, float f);
-ccl_device_inline float2 operator/(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+(const float2 &a, const float f);
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator-(const float2 &a, const float f);
-ccl_device_inline float2 operator-(const float2 &a, const float2 &b);
-ccl_device_inline float2 operator+=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator*=(float2 &a, float f);
-ccl_device_inline float2 operator/=(float2 &a, const float2 &b);
-ccl_device_inline float2 operator/=(float2 &a, float f);
-
-ccl_device_inline bool operator==(const float2 &a, const float2 &b);
-ccl_device_inline bool operator!=(const float2 &a, const float2 &b);
-
-ccl_device_inline bool is_zero(const float2 &a);
-ccl_device_inline float average(const float2 &a);
-ccl_device_inline float distance(const float2 &a, const float2 &b);
-ccl_device_inline float dot(const float2 &a, const float2 &b);
-ccl_device_inline float cross(const float2 &a, const float2 &b);
-ccl_device_inline float len(const float2 a);
-ccl_device_inline float2 normalize(const float2 &a);
-ccl_device_inline float2 normalize_len(const float2 &a, float *t);
-ccl_device_inline float2 safe_normalize(const float2 &a);
-ccl_device_inline float2 min(const float2 &a, const float2 &b);
-ccl_device_inline float2 max(const float2 &a, const float2 &b);
-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx);
-ccl_device_inline float2 fabs(const float2 &a);
-ccl_device_inline float2 as_float2(const float4 &a);
-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t);
-ccl_device_inline float2 floor(const float2 &a);
-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b);
-
-/*******************************************************************************
- * Definition.
- */
-
 ccl_device_inline float2 zero_float2()
 {
  return make_float2(0.0f, 0.0f);
@@ -75,63 +26,63 @@ ccl_device_inline float2 operator-(const float2 &a)
  return make_float2(-a.x, -a.y);
 }

-ccl_device_inline float2 operator*(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator*(const float2 a, const float2 b)
 {
  return make_float2(a.x * b.x, a.y * b.y);
 }

-ccl_device_inline float2 operator*(const float2 &a, float f)
+ccl_device_inline float2 operator*(const float2 a, float f)
 {
  return make_float2(a.x * f, a.y * f);
 }

-ccl_device_inline float2 operator*(float f, const float2 &a)
+ccl_device_inline float2 operator*(float f, const float2 a)
 {
  return make_float2(a.x * f, a.y * f);
 }

-ccl_device_inline float2 operator/(float f, const float2 &a)
+ccl_device_inline float2 operator/(float f, const float2 a)
 {
  return make_float2(f / a.x, f / a.y);
 }

-ccl_device_inline float2 operator/(const float2 &a, float f)
+ccl_device_inline float2 operator/(const float2 a, float f)
 {
  float invf = 1.0f / f;
  return make_float2(a.x * invf, a.y * invf);
 }

-ccl_device_inline float2 operator/(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator/(const float2 a, const float2 b)
 {
  return make_float2(a.x / b.x, a.y / b.y);
 }

-ccl_device_inline float2 operator+(const float2 &a, const float f)
-{
-  return a + make_float2(f, f);
-}
-
-ccl_device_inline float2 operator+(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator+(const float2 a, const float2 b)
 {
  return make_float2(a.x + b.x, a.y + b.y);
 }

-ccl_device_inline float2 operator-(const float2 &a, const float f)
+ccl_device_inline float2 operator+(const float2 a, const float f)
 {
-  return a - make_float2(f, f);
+  return a + make_float2(f, f);
 }

-ccl_device_inline float2 operator-(const float2 &a, const float2 &b)
+ccl_device_inline float2 operator-(const float2 a, const float2 b)
 {
  return make_float2(a.x - b.x, a.y - b.y);
 }

-ccl_device_inline float2 operator+=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator-(const float2 a, const float f)
+{
+  return a - make_float2(f, f);
+}
+
+ccl_device_inline float2 operator+=(float2 &a, const float2 b)
 {
  return a = a + b;
 }

-ccl_device_inline float2 operator*=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator*=(float2 &a, const float2 b)
 {
  return a = a * b;
 }
@@ -141,7 +92,7 @@ ccl_device_inline float2 operator*=(float2 &a, float f)
  return a = a * f;
 }

-ccl_device_inline float2 operator/=(float2 &a, const float2 &b)
+ccl_device_inline float2 operator/=(float2 &a, const float2 b)
 {
  return a = a / b;
 }
@@ -152,74 +103,81 @@ ccl_device_inline float2 operator/=(float2 &a, float f)
  return a = a * invf;
 }

-ccl_device_inline bool operator==(const float2 &a, const float2 &b)
+ccl_device_inline bool operator==(const float2 a, const float2 b)
 {
  return (a.x == b.x && a.y == b.y);
 }

-ccl_device_inline bool operator!=(const float2 &a, const float2 &b)
+ccl_device_inline bool operator!=(const float2 a, const float2 b)
 {
  return !(a == b);
 }

-ccl_device_inline bool is_zero(const float2 &a)
+ccl_device_inline bool is_zero(const float2 a)
 {
  return (a.x == 0.0f && a.y == 0.0f);
 }

-ccl_device_inline float average(const float2 &a)
+ccl_device_inline float average(const float2 a)
 {
  return (a.x + a.y) * (1.0f / 2.0f);
 }

-ccl_device_inline float distance(const float2 &a, const float2 &b)
+ccl_device_inline float dot(const float2 a, const float2 b)
+{
+  return a.x * b.x + a.y * b.y;
+}
+#endif
+
+ccl_device_inline float len(const float2 a)
+{
+  return sqrtf(dot(a, a));
+}
+
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float2 a, const float2 b)
 {
  return len(a - b);
 }

-ccl_device_inline float dot(const float2 &a, const float2 &b)
-{
-  return a.x * b.x + a.y * b.y;
-}
-
-ccl_device_inline float cross(const float2 &a, const float2 &b)
+ccl_device_inline float cross(const float2 a, const float2 b)
 {
  return (a.x * b.y - a.y * b.x);
 }

-ccl_device_inline float2 normalize(const float2 &a)
+ccl_device_inline float2 normalize(const float2 a)
 {
  return a / len(a);
 }

-ccl_device_inline float2 normalize_len(const float2 &a, ccl_private float *t)
+ccl_device_inline float2 normalize_len(const float2 a, ccl_private float *t)
 {
  *t = len(a);
  return a / (*t);
 }

-ccl_device_inline float2 safe_normalize(const float2 &a)
+ccl_device_inline float2 safe_normalize(const float2 a)
 {
  float t = len(a);
  return (t != 0.0f) ? a / t : a;
 }

-ccl_device_inline float2 min(const float2 &a, const float2 &b)
+ccl_device_inline float2 min(const float2 a, const float2 b)
 {
  return make_float2(min(a.x, b.x), min(a.y, b.y));
 }

-ccl_device_inline float2 max(const float2 &a, const float2 &b)
+ccl_device_inline float2 max(const float2 a, const float2 b)
 {
  return make_float2(max(a.x, b.x), max(a.y, b.y));
 }

-ccl_device_inline float2 clamp(const float2 &a, const float2 &mn, const float2 &mx)
+ccl_device_inline float2 clamp(const float2 a, const float2 mn, const float2 mx)
 {
  return min(max(a, mn), mx);
 }

-ccl_device_inline float2 fabs(const float2 &a)
+ccl_device_inline float2 fabs(const float2 a)
 {
  return make_float2(fabsf(a.x), fabsf(a.y));
 }
@@ -229,28 +187,23 @@ ccl_device_inline float2 as_float2(const float4 &a)
  return make_float2(a.x, a.y);
 }

-ccl_device_inline float2 interp(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 interp(const float2 a, const float2 b, float t)
 {
  return a + t * (b - a);
 }

-ccl_device_inline float2 mix(const float2 &a, const float2 &b, float t)
+ccl_device_inline float2 mix(const float2 a, const float2 b, float t)
 {
  return a + t * (b - a);
 }

-ccl_device_inline float2 floor(const float2 &a)
+ccl_device_inline float2 floor(const float2 a)
 {
  return make_float2(floorf(a.x), floorf(a.y));
 }

 #endif /* !__KERNEL_METAL__ */

-ccl_device_inline float len(const float2 a)
-{
-  return sqrtf(dot(a, a));
-}
-
 ccl_device_inline float2 safe_divide_float2_float(const float2 a, const float b)
 {
  return (b != 0.0f) ? a / b : zero_float2();
--- a/intern/cycles/util/math_float3.h
+++ b/intern/cycles/util/math_float3.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
 * Copyright 2011-2022 Blender Foundation */

 #ifndef __UTIL_MATH_FLOAT3_H__
@@ -10,73 +11,6 @@

 CCL_NAMESPACE_BEGIN

-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float3 operator-(const float3 &a);
-ccl_device_inline float3 operator*(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator*(const float3 &a, const float f);
-ccl_device_inline float3 operator*(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float f, const float3 &a);
-ccl_device_inline float3 operator/(const float3 &a, const float f);
-ccl_device_inline float3 operator/(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+(const float3 &a, const float f);
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator-(const float3 &a, const float f);
-ccl_device_inline float3 operator-(const float3 &a, const float3 &b);
-ccl_device_inline float3 operator+=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator-=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator*=(float3 &a, float f);
-ccl_device_inline float3 operator/=(float3 &a, const float3 &b);
-ccl_device_inline float3 operator/=(float3 &a, float f);
-
-ccl_device_inline bool operator==(const float3 &a, const float3 &b);
-ccl_device_inline bool operator!=(const float3 &a, const float3 &b);
-
-ccl_device_inline float distance(const float3 &a, const float3 &b);
-ccl_device_inline float dot(const float3 &a, const float3 &b);
-ccl_device_inline float dot_xy(const float3 &a, const float3 &b);
-ccl_device_inline float3 cross(const float3 &a, const float3 &b);
-ccl_device_inline float3 normalize(const float3 &a);
-ccl_device_inline float3 min(const float3 &a, const float3 &b);
-ccl_device_inline float3 max(const float3 &a, const float3 &b);
-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx);
-ccl_device_inline float3 fabs(const float3 &a);
-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t);
-ccl_device_inline float3 rcp(const float3 &a);
-ccl_device_inline float3 sqrt(const float3 &a);
-ccl_device_inline float3 floor(const float3 &a);
-ccl_device_inline float3 ceil(const float3 &a);
-ccl_device_inline float3 reflect(const float3 incident, const float3 normal);
-#endif /* !defined(__KERNEL_METAL__) */
-
-ccl_device_inline float reduce_min(float3 a);
-ccl_device_inline float reduce_max(float3 a);
-ccl_device_inline float len(const float3 a);
-ccl_device_inline float len_squared(const float3 a);
-
-ccl_device_inline float3 project(const float3 v, const float3 v_proj);
-
-ccl_device_inline float3 safe_normalize(const float3 a);
-ccl_device_inline float3 normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_normalize_len(const float3 a, ccl_private float *t);
-ccl_device_inline float3 safe_divide(const float3 a, const float3 b);
-ccl_device_inline float3 safe_divide(const float3 a, const float b);
-ccl_device_inline float3 interp(float3 a, float3 b, float t);
-ccl_device_inline float3 sqr(float3 a);
-
-ccl_device_inline bool is_zero(const float3 a);
-ccl_device_inline float reduce_add(const float3 a);
-ccl_device_inline float average(const float3 a);
-ccl_device_inline bool isequal(const float3 a, const float3 b);
-
-/*******************************************************************************
- * Definition.
- */
-
 ccl_device_inline float3 zero_float3()
 {
 #ifdef __KERNEL_SSE__
@@ -109,7 +43,7 @@ ccl_device_inline float3 operator-(const float3 &a)
 #  endif
 }

-ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator*(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_mul_ps(a.m128, b.m128));
@@ -118,7 +52,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float3 operator*(const float3 &a, const float f)
+ccl_device_inline float3 operator*(const float3 a, const float f)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_mul_ps(a.m128, _mm_set1_ps(f)));
@@ -127,7 +61,7 @@ ccl_device_inline float3 operator*(const float3 &a, const float f)
 #  endif
 }

-ccl_device_inline float3 operator*(const float f, const float3 &a)
+ccl_device_inline float3 operator*(const float f, const float3 a)
 {
 #  if defined(__KERNEL_SSE__)
  return float3(_mm_mul_ps(_mm_set1_ps(f), a.m128));
@@ -136,7 +70,7 @@ ccl_device_inline float3 operator*(const float f, const float3 &a)
 #  endif
 }

-ccl_device_inline float3 operator/(const float f, const float3 &a)
+ccl_device_inline float3 operator/(const float f, const float3 a)
 {
 #  if defined(__KERNEL_SSE__)
  return float3(_mm_div_ps(_mm_set1_ps(f), a.m128));
@@ -145,7 +79,7 @@ ccl_device_inline float3 operator/(const float f, const float3 &a)
 #  endif
 }

-ccl_device_inline float3 operator/(const float3 &a, const float f)
+ccl_device_inline float3 operator/(const float3 a, const float f)
 {
 #  if defined(__KERNEL_SSE__)
  return float3(_mm_div_ps(a.m128, _mm_set1_ps(f)));
@@ -154,7 +88,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float f)
 #  endif
 }

-ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator/(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE__)
  return float3(_mm_div_ps(a.m128, b.m128));
@@ -163,12 +97,7 @@ ccl_device_inline float3 operator/(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float3 operator+(const float3 &a, const float f)
-{
-  return a + make_float3(f, f, f);
-}
-
-ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator+(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_add_ps(a.m128, b.m128));
@@ -177,12 +106,12 @@ ccl_device_inline float3 operator+(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float3 operator-(const float3 &a, const float f)
+ccl_device_inline float3 operator+(const float3 a, const float f)
 {
-  return a - make_float3(f, f, f);
+  return a + make_float3(f, f, f);
 }

-ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_sub_ps(a.m128, b.m128));
@@ -191,17 +120,22 @@ ccl_device_inline float3 operator-(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float3 operator+=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-(const float3 a, const float f)
+{
+  return a - make_float3(f, f, f);
+}
+
+ccl_device_inline float3 operator+=(float3 &a, const float3 b)
 {
  return a = a + b;
 }

-ccl_device_inline float3 operator-=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator-=(float3 &a, const float3 b)
 {
  return a = a - b;
 }

-ccl_device_inline float3 operator*=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator*=(float3 &a, const float3 b)
 {
  return a = a * b;
 }
@@ -211,7 +145,7 @@ ccl_device_inline float3 operator*=(float3 &a, float f)
  return a = a * f;
 }

-ccl_device_inline float3 operator/=(float3 &a, const float3 &b)
+ccl_device_inline float3 operator/=(float3 &a, const float3 b)
 {
  return a = a / b;
 }
@@ -223,7 +157,7 @@ ccl_device_inline float3 operator/=(float3 &a, float f)
 }

 #  if !(defined(__KERNEL_METAL__) || defined(__KERNEL_CUDA__))
-ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator*=(packed_float3 &a, const float3 b)
 {
  a = float3(a) * b;
  return a;
@@ -235,7 +169,7 @@ ccl_device_inline packed_float3 operator*=(packed_float3 &a, float f)
  return a;
 }

-ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 &b)
+ccl_device_inline packed_float3 operator/=(packed_float3 &a, const float3 b)
 {
  a = float3(a) / b;
  return a;
@@ -248,7 +182,7 @@ ccl_device_inline packed_float3 operator/=(packed_float3 &a, float f)
 }
 #  endif

-ccl_device_inline bool operator==(const float3 &a, const float3 &b)
+ccl_device_inline bool operator==(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
  return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 7) == 7;
@@ -257,17 +191,12 @@ ccl_device_inline bool operator==(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline bool operator!=(const float3 &a, const float3 &b)
+ccl_device_inline bool operator!=(const float3 a, const float3 b)
 {
  return !(a == b);
 }

-ccl_device_inline float distance(const float3 &a, const float3 &b)
-{
-  return len(a - b);
-}
-
-ccl_device_inline float dot(const float3 &a, const float3 &b)
+ccl_device_inline float dot(const float3 a, const float3 b)
 {
 #  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0x7F));
@@ -276,26 +205,62 @@ ccl_device_inline float dot(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float dot_xy(const float3 &a, const float3 &b)
+#endif
+
+ccl_device_inline float dot_xy(const float3 a, const float3 b)
 {
-#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
  return _mm_cvtss_f32(_mm_hadd_ps(_mm_mul_ps(a, b), b));
-#  else
+#else
  return a.x * b.x + a.y * b.y;
-#  endif
+#endif
 }

-ccl_device_inline float3 cross(const float3 &a, const float3 &b)
+ccl_device_inline float len(const float3 a)
+{
+#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+  return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
+#else
+  return sqrtf(dot(a, a));
+#endif
+}
+
+ccl_device_inline float reduce_min(float3 a)
+{
+  return min(min(a.x, a.y), a.z);
+}
+
+ccl_device_inline float reduce_max(float3 a)
+{
+  return max(max(a.x, a.y), a.z);
+}
+
+ccl_device_inline float len_squared(const float3 a)
+{
+  return dot(a, a);
+}
+
+#ifndef __KERNEL_METAL__
+
+ccl_device_inline float distance(const float3 a, const float3 b)
+{
+  return len(a - b);
+}
+
+ccl_device_inline float3 cross(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
-  return float3(shuffle<1, 2, 0, 3>(
-      msub(ssef(a), shuffle<1, 2, 0, 3>(ssef(b)), shuffle<1, 2, 0, 3>(ssef(a)) * ssef(b))));
+  const float4 x = float4(a.m128);
+  const float4 y = shuffle<1, 2, 0, 3>(float4(b.m128));
+  const float4 z = float4(_mm_mul_ps(shuffle<1, 2, 0, 3>(float4(a.m128)), float4(b.m128)));
+
+  return float3(shuffle<1, 2, 0, 3>(msub(x, y, z)).m128);
 #  else
  return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
 #  endif
 }

-ccl_device_inline float3 normalize(const float3 &a)
+ccl_device_inline float3 normalize(const float3 a)
 {
 #  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
  __m128 norm = _mm_sqrt_ps(_mm_dp_ps(a.m128, a.m128, 0x7F));
@@ -305,7 +270,7 @@ ccl_device_inline float3 normalize(const float3 &a)
 #  endif
 }

-ccl_device_inline float3 min(const float3 &a, const float3 &b)
+ccl_device_inline float3 min(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_min_ps(a.m128, b.m128));
@@ -314,7 +279,7 @@ ccl_device_inline float3 min(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float3 max(const float3 &a, const float3 &b)
+ccl_device_inline float3 max(const float3 a, const float3 b)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_max_ps(a.m128, b.m128));
@@ -323,12 +288,12 @@ ccl_device_inline float3 max(const float3 &a, const float3 &b)
 #  endif
 }

-ccl_device_inline float3 clamp(const float3 &a, const float3 &mn, const float3 &mx)
+ccl_device_inline float3 clamp(const float3 a, const float3 mn, const float3 mx)
 {
  return min(max(a, mn), mx);
 }

-ccl_device_inline float3 fabs(const float3 &a)
+ccl_device_inline float3 fabs(const float3 a)
 {
 #  ifdef __KERNEL_SSE__
 #    ifdef __KERNEL_NEON__
@@ -342,7 +307,7 @@ ccl_device_inline float3 fabs(const float3 &a)
 #  endif
 }

-ccl_device_inline float3 sqrt(const float3 &a)
+ccl_device_inline float3 sqrt(const float3 a)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_sqrt_ps(a));
@@ -351,7 +316,7 @@ ccl_device_inline float3 sqrt(const float3 &a)
 #  endif
 }

-ccl_device_inline float3 floor(const float3 &a)
+ccl_device_inline float3 floor(const float3 a)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_floor_ps(a));
@@ -360,7 +325,7 @@ ccl_device_inline float3 floor(const float3 &a)
 #  endif
 }

-ccl_device_inline float3 ceil(const float3 &a)
+ccl_device_inline float3 ceil(const float3 a)
 {
 #  ifdef __KERNEL_SSE__
  return float3(_mm_ceil_ps(a));
@@ -369,12 +334,12 @@ ccl_device_inline float3 ceil(const float3 &a)
 #  endif
 }

-ccl_device_inline float3 mix(const float3 &a, const float3 &b, float t)
+ccl_device_inline float3 mix(const float3 a, const float3 b, float t)
 {
  return a + t * (b - a);
 }

-ccl_device_inline float3 rcp(const float3 &a)
+ccl_device_inline float3 rcp(const float3 a)
 {
 #  ifdef __KERNEL_SSE__
  /* Don't use _mm_rcp_ps due to poor precision. */
@@ -399,33 +364,6 @@ ccl_device_inline float3 log(float3 v)
  return make_float3(logf(v.x), logf(v.y), logf(v.z));
 }

-#endif /* !__KERNEL_METAL__ */
-
-ccl_device_inline float reduce_min(float3 a)
-{
-  return min(min(a.x, a.y), a.z);
-}
-
-ccl_device_inline float reduce_max(float3 a)
-{
-  return max(max(a.x, a.y), a.z);
-}
-
-ccl_device_inline float len(const float3 a)
-{
-#if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-  return _mm_cvtss_f32(_mm_sqrt_ss(_mm_dp_ps(a.m128, a.m128, 0x7F)));
-#else
-  return sqrtf(dot(a, a));
-#endif
-}
-
-ccl_device_inline float len_squared(const float3 a)
-{
-  return dot(a, a);
-}
-
-#if !defined(__KERNEL_METAL__)
 ccl_device_inline float3 reflect(const float3 incident, const float3 normal)
 {
  float3 unit_normal = normalize(normal);
--- a/intern/cycles/util/math_float4.h
+++ b/intern/cycles/util/math_float4.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2013 Intel Corporation
 * Copyright 2011-2022 Blender Foundation */

 #ifndef __UTIL_MATH_FLOAT4_H__
@@ -10,85 +11,6 @@

 CCL_NAMESPACE_BEGIN

-/*******************************************************************************
- * Declaration.
- */
-
-#if !defined(__KERNEL_METAL__)
-ccl_device_inline float4 operator-(const float4 &a);
-ccl_device_inline float4 operator*(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator*(const float4 &a, float f);
-ccl_device_inline float4 operator*(float f, const float4 &a);
-ccl_device_inline float4 operator/(const float4 &a, float f);
-ccl_device_inline float4 operator/(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+(const float4 &a, const float f);
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator-(const float4 &a, const float f);
-ccl_device_inline float4 operator-(const float4 &a, const float4 &b);
-ccl_device_inline float4 operator+=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, const float4 &b);
-ccl_device_inline float4 operator*=(float4 &a, float f);
-ccl_device_inline float4 operator/=(float4 &a, float f);
-
-ccl_device_inline int4 operator<(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b);
-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b);
-ccl_device_inline bool operator==(const float4 &a, const float4 &b);
-
-ccl_device_inline float distance(const float4 &a, const float4 &b);
-ccl_device_inline float dot(const float4 &a, const float4 &b);
-ccl_device_inline float len_squared(const float4 &a);
-ccl_device_inline float4 rcp(const float4 &a);
-ccl_device_inline float4 sqrt(const float4 &a);
-ccl_device_inline float4 sqr(const float4 &a);
-ccl_device_inline float4 cross(const float4 &a, const float4 &b);
-ccl_device_inline bool is_zero(const float4 &a);
-ccl_device_inline float average(const float4 &a);
-ccl_device_inline float len(const float4 &a);
-ccl_device_inline float4 normalize(const float4 &a);
-ccl_device_inline float4 safe_normalize(const float4 &a);
-ccl_device_inline float4 min(const float4 &a, const float4 &b);
-ccl_device_inline float4 max(const float4 &a, const float4 &b);
-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx);
-ccl_device_inline float4 fabs(const float4 &a);
-ccl_device_inline float4 floor(const float4 &a);
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t);
-#endif /* !__KERNEL_METAL__*/
-
-ccl_device_inline float4 safe_divide(const float4 a, const float4 b);
-ccl_device_inline float4 safe_divide(const float4 a, const float b);
-
-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b);
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b);
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b);
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b);
-
-#  ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b);
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b);
-#  endif
-#endif /* __KERNEL_SSE__ */
-
-ccl_device_inline float reduce_min(const float4 a);
-ccl_device_inline float reduce_max(const float4 a);
-ccl_device_inline float reduce_add(const float4 a);
-
-ccl_device_inline bool isequal(const float4 a, const float4 b);
-
-#ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b);
-#endif /* !__KERNEL_GPU__ */
-
-/*******************************************************************************
- * Definition.
- */
-
 ccl_device_inline float4 zero_float4()
 {
 #ifdef __KERNEL_SSE__
@@ -103,6 +25,16 @@ ccl_device_inline float4 one_float4()
  return make_float4(1.0f, 1.0f, 1.0f, 1.0f);
 }

+ccl_device_inline int4 cast(const float4 a)
+{
+#ifdef __KERNEL_SSE__
+  return int4(_mm_castps_si128(a));
+#else
+  return make_int4(
+      __float_as_int(a.x), __float_as_int(a.y), __float_as_int(a.z), __float_as_int(a.w));
+#endif
+}
+
 #if !defined(__KERNEL_METAL__)
 ccl_device_inline float4 operator-(const float4 &a)
 {
@@ -114,7 +46,7 @@ ccl_device_inline float4 operator-(const float4 &a)
 #  endif
 }

-ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator*(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return float4(_mm_mul_ps(a.m128, b.m128));
@@ -123,7 +55,7 @@ ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float4 operator*(const float4 &a, float f)
+ccl_device_inline float4 operator*(const float4 a, float f)
 {
 #  if defined(__KERNEL_SSE__)
  return a * make_float4(f);
@@ -132,17 +64,17 @@ ccl_device_inline float4 operator*(const float4 &a, float f)
 #  endif
 }

-ccl_device_inline float4 operator*(float f, const float4 &a)
+ccl_device_inline float4 operator*(float f, const float4 a)
 {
  return a * f;
 }

-ccl_device_inline float4 operator/(const float4 &a, float f)
+ccl_device_inline float4 operator/(const float4 a, float f)
 {
  return a * (1.0f / f);
 }

-ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator/(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return float4(_mm_div_ps(a.m128, b.m128));
@@ -151,12 +83,7 @@ ccl_device_inline float4 operator/(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float4 operator+(const float4 &a, const float f)
-{
-  return a + make_float4(f, f, f, f);
-}
-
-ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator+(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return float4(_mm_add_ps(a.m128, b.m128));
@@ -165,12 +92,12 @@ ccl_device_inline float4 operator+(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float4 operator-(const float4 &a, const float f)
+ccl_device_inline float4 operator+(const float4 a, const float f)
 {
-  return a - make_float4(f, f, f, f);
+  return a + make_float4(f);
 }

-ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return float4(_mm_sub_ps(a.m128, b.m128));
@@ -179,17 +106,22 @@ ccl_device_inline float4 operator-(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-(const float4 a, const float f)
+{
+  return a - make_float4(f);
+}
+
+ccl_device_inline float4 operator+=(float4 &a, const float4 b)
 {
  return a = a + b;
 }

-ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator-=(float4 &a, const float4 b)
 {
  return a = a - b;
 }

-ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
+ccl_device_inline float4 operator*=(float4 &a, const float4 b)
 {
  return a = a * b;
 }
@@ -204,7 +136,7 @@ ccl_device_inline float4 operator/=(float4 &a, float f)
  return a = a / f;
 }

-ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(_mm_cmplt_ps(a.m128, b.m128)));
@@ -213,7 +145,7 @@ ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator>=(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(_mm_cmpge_ps(a.m128, b.m128)));
@@ -222,7 +154,7 @@ ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
+ccl_device_inline int4 operator<=(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return int4(_mm_castps_si128(_mm_cmple_ps(a.m128, b.m128)));
@@ -231,7 +163,7 @@ ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline bool operator==(const float4 &a, const float4 &b)
+ccl_device_inline bool operator==(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return (_mm_movemask_ps(_mm_cmpeq_ps(a.m128, b.m128)) & 15) == 15;
@@ -240,95 +172,19 @@ ccl_device_inline bool operator==(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float distance(const float4 &a, const float4 &b)
-{
-  return len(a - b);
-}
-
-ccl_device_inline float dot(const float4 &a, const float4 &b)
-{
-#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  __m128 t = vmulq_f32(a, b);
-  return vaddvq_f32(t);
-#    else
-  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF));
-#    endif
-#  else
-  return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w);
-#  endif
-}
-
-ccl_device_inline float len_squared(const float4 &a)
-{
-  return dot(a, a);
-}
-
-ccl_device_inline float4 rcp(const float4 &a)
+ccl_device_inline const float4 operator^(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
-  /* Don't use _mm_rcp_ps due to poor precision. */
-  return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+  return float4(_mm_xor_ps(a.m128, b.m128));
 #  else
-  return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+  return make_float4(__uint_as_float(__float_as_uint(a.x) ^ __float_as_uint(b.x)),
+                     __uint_as_float(__float_as_uint(a.y) ^ __float_as_uint(b.y)),
+                     __uint_as_float(__float_as_uint(a.z) ^ __float_as_uint(b.z)),
+                     __uint_as_float(__float_as_uint(a.w) ^ __float_as_uint(b.w)));
 #  endif
 }

-ccl_device_inline float4 sqrt(const float4 &a)
-{
-#  ifdef __KERNEL_SSE__
-  return float4(_mm_sqrt_ps(a.m128));
-#  else
-  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
-#  endif
-}
-
-ccl_device_inline float4 sqr(const float4 &a)
-{
-  return a * a;
-}
-
-ccl_device_inline float4 cross(const float4 &a, const float4 &b)
-{
-#  ifdef __KERNEL_SSE__
-  return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) -
-         (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b));
-#  else
-  return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f);
-#  endif
-}
-
-ccl_device_inline bool is_zero(const float4 &a)
-{
-#  ifdef __KERNEL_SSE__
-  return a == zero_float4();
-#  else
-  return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
-#  endif
-}
-
-ccl_device_inline float average(const float4 &a)
-{
-  return reduce_add(a) * 0.25f;
-}
-
-ccl_device_inline float len(const float4 &a)
-{
-  return sqrtf(dot(a, a));
-}
-
-ccl_device_inline float4 normalize(const float4 &a)
-{
-  return a / len(a);
-}
-
-ccl_device_inline float4 safe_normalize(const float4 &a)
-{
-  float t = len(a);
-  return (t != 0.0f) ? a / t : a;
-}
-
-ccl_device_inline float4 min(const float4 &a, const float4 &b)
+ccl_device_inline float4 min(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return float4(_mm_min_ps(a.m128, b.m128));
@@ -337,7 +193,7 @@ ccl_device_inline float4 min(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float4 max(const float4 &a, const float4 &b)
+ccl_device_inline float4 max(const float4 a, const float4 b)
 {
 #  ifdef __KERNEL_SSE__
  return float4(_mm_max_ps(a.m128, b.m128));
@@ -346,55 +202,119 @@ ccl_device_inline float4 max(const float4 &a, const float4 &b)
 #  endif
 }

-ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
+ccl_device_inline float4 clamp(const float4 a, const float4 mn, const float4 mx) /* Returns min(max(a, mn), mx): lower bound applied first, then upper. */
 {
   return min(max(a, mn), mx);
 }
-
-ccl_device_inline float4 fabs(const float4 &a)
-{
-#  if defined(__KERNEL_SSE__)
-#    if defined(__KERNEL_NEON__)
-  return float4(vabsq_f32(a));
-#    else
-  return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
-#    endif
-#  else
-  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-#  endif
-}
-
-ccl_device_inline float4 floor(const float4 &a)
-{
-#  ifdef __KERNEL_SSE__
-  return float4(_mm_floor_ps(a));
-#  else
-  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-#  endif
-}
-
-ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
-{
-  return a + t * (b - a);
-}
-
-ccl_device_inline float4 saturate(const float4 &a)
-{
-  return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
-}
-
-ccl_device_inline float4 exp(float4 v)
-{
-  return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.z));
-}
-
-ccl_device_inline float4 log(float4 v)
-{
-  return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.z));
-}
-
 #endif /* !__KERNEL_METAL__*/

+ccl_device_inline const float4 madd(const float4 a, const float4 b, const float4 c) /* Multiply-add: a * b + c per component. NOTE(review): msub returns non-const float4; confirm the const here is intended. */
+{
+#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_NEON__
+  return float4(vfmaq_f32(c, a, b)); /* NEON takes the addend first: c + a * b. */
+#  elif defined(__KERNEL_AVX2__)
+  return float4(_mm_fmadd_ps(a, b, c)); /* Fused a * b + c. */
+#  else
+  return a * b + c; /* SSE without AVX2: separate multiply and add. */
+#  endif
+#else
+  return a * b + c; /* Non-SIMD build. */
+#endif
+}
+
+ccl_device_inline float4 msub(const float4 a, const float4 b, const float4 c) /* Multiply-subtract: a * b - c per component. */
+{
+#ifdef __KERNEL_SSE__
+#  ifdef __KERNEL_NEON__
+  return float4(vfmaq_f32(vnegq_f32(c), a, b)); /* (-c) + a * b, addend passed first. */
+#  elif defined(__KERNEL_AVX2__)
+  return float4(_mm_fmsub_ps(a, b, c)); /* Fused a * b - c. */
+#  else
+  return a * b - c; /* SSE without AVX2: separate multiply and subtract. */
+#  endif
+#else
+  return a * b - c; /* Non-SIMD build. */
+#endif
+}
+
+#ifdef __KERNEL_SSE__ /* Lane permutation helpers, only compiled for SIMD builds. */
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 b) /* Single-source shuffle: result lanes are (b[i0], b[i1], b[i2], b[i3]). */
+{
+#  ifdef __KERNEL_NEON__
+  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(b.m128));
+#  else
+  return float4( /* Uses the integer shuffle on the reinterpreted bits; _MM_SHUFFLE lists indices highest lane first. */
+      _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0))));
+#  endif
+}
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a) /* Specialization: (a0, a1, a0, a1) via _mm_movelh_ps. */
+{
+  return float4(_mm_movelh_ps(a, a));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a) /* Specialization: (a2, a3, a2, a3) via _mm_movehl_ps. */
+{
+  return float4(_mm_movehl_ps(a, a));
+}
+
+#  ifdef __KERNEL_SSE3__
+template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 b) /* Specialization: duplicates the even lanes (b0, b0, b2, b2). */
+{
+  return float4(_mm_moveldup_ps(b));
+}
+
+template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 b) /* Specialization: duplicates the odd lanes (b1, b1, b3, b3). */
+{
+  return float4(_mm_movehdup_ps(b));
+}
+#  endif /* __KERNEL_SSE3__ */
+
+template<size_t i0, size_t i1, size_t i2, size_t i3>
+__forceinline const float4 shuffle(const float4 a, const float4 b) /* Two-source shuffle. */
+{
+#  ifdef __KERNEL_NEON__
+  return float4(shuffle_neon<float32x4_t, i0, i1, i2, i3>(a, b));
+#  else
+  return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0))); /* Result is (a[i0], a[i1], b[i2], b[i3]). */
+#  endif
+}
+
+template<size_t i0> __forceinline const float4 shuffle(const float4 b) /* Broadcasts lane i0 of b to all four lanes. */
+{
+  return shuffle<i0, i0, i0, i0>(b);
+}
+template<size_t i0> __forceinline const float4 shuffle(const float4 a, const float4 b) /* Two-source form with one index: (a[i0], a[i0], b[i0], b[i0]) on the SSE path. */
+{
+#  ifdef __KERNEL_NEON__
+  return float4(shuffle_neon<float32x4_t, i0, i0, i0, i0>(a, b)); /* NOTE(review): assumes shuffle_neon matches the _mm_shuffle_ps lane convention - confirm. */
+#  else
+  return float4(_mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0)));
+#  endif
+}
+
+template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 a, const float4 b) /* Specialization: (a0, a1, b0, b1). */
+{
+  return float4(_mm_movelh_ps(a, b));
+}
+
+template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 a, const float4 b) /* Specialization: (a2, a3, b2, b3). */
+{
+  return float4(_mm_movehl_ps(b, a)); /* Operands are swapped on purpose: movehl puts its second operand's high half in the low lanes. */
+}
+
+template<size_t i> __forceinline float extract(const float4 a) /* Returns lane i of a as a scalar. */
+{
+  return _mm_cvtss_f32(shuffle<i, i, i, i>(a)); /* Broadcast lane i, then read lane 0. */
+}
+template<> __forceinline float extract<0>(const float4 a) /* Lane 0 needs no shuffle. */
+{
+  return _mm_cvtss_f32(a);
+}
+#endif /* __KERNEL_SSE__ */
+
 ccl_device_inline float reduce_add(const float4 a)
 {
 #if defined(__KERNEL_SSE__)
@@ -440,6 +360,166 @@ ccl_device_inline float reduce_max(const float4 a)
 #endif
 }

+#if !defined(__KERNEL_METAL__) /* Not compiled for Metal. */
+ccl_device_inline float dot(const float4 a, const float4 b) /* Four-component dot product of a and b. */
+{
+#  if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  __m128 t = vmulq_f32(a, b); /* Per-lane products. */
+  return vaddvq_f32(t); /* Horizontal sum of the four products. */
+#    else
+  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xFF)); /* Mask 0xFF: multiply all four lanes and write the sum to every lane; lane 0 is returned. */
+#    endif
+#  else
+  return (a.x * b.x + a.y * b.y) + (a.z * b.z + a.w * b.w); /* Scalar fallback, summed pairwise. */
+#  endif
+}
+#endif /* !defined(__KERNEL_METAL__) */
+
+ccl_device_inline float len(const float4 a) /* Euclidean length over all four components: sqrtf(dot(a, a)). */
+{
+  return sqrtf(dot(a, a));
+}
+
+ccl_device_inline float len_squared(const float4 a) /* Squared length dot(a, a); no square root is taken. */
+{
+  return dot(a, a);
+}
+
+#if !defined(__KERNEL_METAL__)
+ccl_device_inline float distance(const float4 a, const float4 b) /* Length of the difference a - b. */
+{
+  return len(a - b);
+}
+
+ccl_device_inline float4 rcp(const float4 a) /* Per-component reciprocal 1 / a, computed with a full division on both paths. */
+{
+#  ifdef __KERNEL_SSE__
+  /* Don't use _mm_rcp_ps due to poor precision. */
+  return float4(_mm_div_ps(_mm_set_ps1(1.0f), a.m128));
+#  else
+  return make_float4(1.0f / a.x, 1.0f / a.y, 1.0f / a.z, 1.0f / a.w);
+#  endif
+}
+
+ccl_device_inline float4 sqrt(const float4 a) /* Per-component square root. */
+{
+#  ifdef __KERNEL_SSE__
+  return float4(_mm_sqrt_ps(a.m128));
+#  else
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+#  endif
+}
+
+ccl_device_inline float4 sqr(const float4 a) /* Per-component square, a * a. */
+{
+  return a * a;
+}
+
+ccl_device_inline float4 cross(const float4 a, const float4 b) /* 3D cross product of the xyz parts of a and b; the w inputs are not read. */
+{
+#  ifdef __KERNEL_SSE__
+  return (shuffle<1, 2, 0, 0>(a) * shuffle<2, 0, 1, 0>(b)) - /* (a.y, a.z, a.x, a.x) * (b.z, b.x, b.y, b.x) */
+         (shuffle<2, 0, 1, 0>(a) * shuffle<1, 2, 0, 0>(b)); /* Lane w evaluates a.x * b.x - a.x * b.x. */
+#  else
+  return make_float4(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x, 0.0f); /* w is set to 0 explicitly. */
+#  endif
+}
+
+ccl_device_inline bool is_zero(const float4 a) /* True when all four components compare equal to zero. */
+{
+#  ifdef __KERNEL_SSE__
+  return a == zero_float4(); /* NOTE(review): relies on float4 operator== comparing all lanes - confirm. */
+#  else
+  return (a.x == 0.0f && a.y == 0.0f && a.z == 0.0f && a.w == 0.0f);
+#  endif
+}
+
+ccl_device_inline float average(const float4 a) /* reduce_add(a) scaled by 0.25f, i.e. divided by the four components. */
+{
+  return reduce_add(a) * 0.25f;
+}
+
+ccl_device_inline float4 normalize(const float4 a) /* Divides a by its length; no zero-length check (see safe_normalize). */
+{
+  return a / len(a);
+}
+
+ccl_device_inline float4 safe_normalize(const float4 a) /* Like normalize(), but returns a unchanged when its length is exactly zero. */
+{
+  float t = len(a);
+  return (t != 0.0f) ? a / t : a; /* Guard against division by zero. */
+}
+
+ccl_device_inline float4 fabs(const float4 a) /* Per-component absolute value. */
+{
+#  if defined(__KERNEL_SSE__)
+#    if defined(__KERNEL_NEON__)
+  return float4(vabsq_f32(a));
+#    else
+  return float4(_mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); /* Mask 0x7fffffff clears the sign bit of every lane. */
+#    endif
+#  else
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+#  endif
+}
+
+ccl_device_inline float4 floor(const float4 a) /* Per-component floor. */
+{
+#  ifdef __KERNEL_SSE__
+#    if defined(__KERNEL_NEON__)
+  return float4(vrndmq_f32(a)); /* NEON: round toward minus infinity. */
+#    else
+  return float4(_mm_floor_ps(a));
+#    endif
+#  else
+  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+#  endif
+}
+
+ccl_device_inline float4 floorfrac(const float4 x, ccl_private int4 *i) /* Writes an integer part of each lane to *i and returns a per-lane remainder. */
+{
+#  ifdef __KERNEL_SSE__
+  const float4 f = floor(x);
+  *i = int4(_mm_cvttps_epi32(f.m128)); /* f is already floored; the truncating conversion turns it into integers. */
+  return x - f;
+#  else
+  float4 r; /* Scalar path: defer to the scalar floorfrac() for each component. */
+  r.x = floorfrac(x.x, &i->x);
+  r.y = floorfrac(x.y, &i->y);
+  r.z = floorfrac(x.z, &i->z);
+  r.w = floorfrac(x.w, &i->w);
+  return r;
+#  endif
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, float t) /* Linear interpolation a + t * (b - a) with one scalar factor for all lanes. */
+{
+  return a + t * (b - a);
+}
+
+ccl_device_inline float4 mix(const float4 a, const float4 b, const float4 t) /* Linear interpolation a + t * (b - a) with a separate factor per lane. */
+{
+  return a + t * (b - a);
+}
+
+ccl_device_inline float4 saturate(const float4 a) /* Applies saturatef() to each component (presumably a clamp to [0, 1] - confirm against saturatef). */
+{
+  return make_float4(saturatef(a.x), saturatef(a.y), saturatef(a.z), saturatef(a.w));
+}
+
+ccl_device_inline float4 exp(float4 v) /* Per-component natural exponential, expf() applied to x, y, z and w. */
+{
+  return make_float4(expf(v.x), expf(v.y), expf(v.z), expf(v.w)); /* The w lane must read v.w, not v.z. */
+}
+
+ccl_device_inline float4 log(float4 v) /* Per-component natural logarithm, logf() applied to x, y, z and w. */
+{
+  return make_float4(logf(v.x), logf(v.y), logf(v.z), logf(v.w)); /* The w lane must read v.w, not v.z. */
+}
+
+#endif /* !__KERNEL_METAL__*/
+
 ccl_device_inline bool isequal(const float4 a, const float4 b)
 {
 #if defined(__KERNEL_METAL__)
@@ -449,68 +529,23 @@ ccl_device_inline bool isequal(const float4 a, const float4 b)
 #endif
 }

-#ifdef __KERNEL_SSE__
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &b)
-{
-#  if defined(__KERNEL_NEON__)
-  return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
-#  else
-  return float4(_mm_castsi128_ps(
-      _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
-#  endif
-}
-
-template<size_t index_0, size_t index_1, size_t index_2, size_t index_3>
-__forceinline const float4 shuffle(const float4 &a, const float4 &b)
-{
-#  if defined(__KERNEL_NEON__)
-  return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(a.m128, b.m128));
-#  else
-  return float4(_mm_shuffle_ps(a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
-#  endif
-}
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &b)
-{
-  return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
-}
-
-template<> __forceinline const float4 shuffle<0, 1, 0, 1>(const float4 &a, const float4 &b)
-{
-  return float4(_mm_movelh_ps(a.m128, b.m128));
-}
-
-template<> __forceinline const float4 shuffle<2, 3, 2, 3>(const float4 &a, const float4 &b)
-{
-  return float4(_mm_movehl_ps(b.m128, a.m128));
-}
-
-#  ifdef __KERNEL_SSE3__
-template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
-{
-  return float4(_mm_moveldup_ps(b));
-}
-
-template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
-{
-  return float4(_mm_movehdup_ps(b));
-}
-#  endif /* __KERNEL_SSE3__ */
-#endif   /* __KERNEL_SSE__ */
-
 #ifndef __KERNEL_GPU__
-ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
+ccl_device_inline float4 select(const int4 mask, const float4 a, const float4 b) /* Per-lane choice: a where the mask lane is set, otherwise b. */
 {
 #  ifdef __KERNEL_SSE__
+#    ifdef __KERNEL_SSE41__ /* SSE4.1: _mm_blendv_ps picks by the top bit of each mask lane. */
   return float4(_mm_blendv_ps(b.m128, a.m128, _mm_castsi128_ps(mask.m128)));
+#    else /* Older SSE: bitwise blend (mask & a) | (~mask & b). */
+  return float4(
+      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), a), _mm_andnot_ps(_mm_castsi128_ps(mask), b)));
+#    endif /* Both SIMD paths agree when every mask lane is all ones or all zeros. */
 #  else
   return make_float4(
       (mask.x) ? a.x : b.x, (mask.y) ? a.y : b.y, (mask.z) ? a.z : b.z, (mask.w) ? a.w : b.w);
 #  endif
 }

-ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
+ccl_device_inline float4 mask(const int4 mask, const float4 a)
 {
  /* Replace elements of x with zero where mask isn't set. */
  return select(mask, a, zero_float4());
--- a/Show More
+++ b/Show More