Proof of Concept: File Browser thumbnail mode using grid view

This was meant as an experiment to see how tangible it is to rewrite the File Browser UI code to be based on views, starting with the grid view for thumbnail mode. See T99890. My initial conclusion is that porting to views is quite doable, but we'll need some further UI code features to make certain things possible. Like big "composed" icons, where a file type icon is displayed on top of a big, generic file icon. There is a fair bit of stuff here that I'm not happy with. Plus things like selection, double clicking to open and renaming don't work yet. It's a start, a proof of concept even :)
Merge branch 'asset-browser-grid-view' into file-browser-grid-view
2022-07-21 17:16:10 +02:00 · 2022-07-20 17:25:31 +02:00 · 2022-07-20 17:17:12 +02:00 · 2022-07-20 17:16:15 +02:00 · 2022-07-20 17:16:15 +02:00 · 2022-07-20 16:08:02 +02:00
810 changed files with 17615 additions and 10549 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -565,15 +565,19 @@ endif()
 option(WITH_OPENGL              "When off limits visibility of the opengl headers to just bf_gpu and gawain (temporary option for development purposes)" ON)
 option(WITH_GLEW_ES             "Switches to experimental copy of GLEW that has support for OpenGL ES. (temporary option for development purposes)" OFF)
 option(WITH_GL_PROFILE_ES20     "Support using OpenGL ES 2.0. (through either EGL or the AGL/WGL/XGL 'es20' profile)"                               OFF)
-option(WITH_GPU_SHADER_BUILDER  "Shader builder is a developer option enabling linting on GLSL during compilation"                                  OFF)
+option(WITH_GPU_BUILDTIME_SHADER_BUILDER  "Shader builder is a developer option enabling linting on GLSL during compilation"                                  OFF)

 mark_as_advanced(
  WITH_OPENGL
  WITH_GLEW_ES
  WITH_GL_PROFILE_ES20
-  WITH_GPU_SHADER_BUILDER
+  WITH_GPU_BUILDTIME_SHADER_BUILDER
 )

+if(WITH_HEADLESS)
+  set(WITH_OPENGL OFF)
+endif()
+
 # Metal

 if (APPLE)
--- a/build_files/build_environment/cmake/dpcpp.cmake
+++ b/build_files/build_environment/cmake/dpcpp.cmake
@@ -63,6 +63,8 @@ set(DPCPP_EXTRA_ARGS
  -DPython3_ROOT_DIR=${LIBDIR}/python/
  -DPython3_EXECUTABLE=${PYTHON_BINARY}
  -DPYTHON_EXECUTABLE=${PYTHON_BINARY}
+  -DLLDB_ENABLE_CURSES=OFF
+  -DLLVM_ENABLE_TERMINFO=OFF
 )

 if(WIN32)
--- a/build_files/build_environment/cmake/embree.cmake
+++ b/build_files/build_environment/cmake/embree.cmake
@@ -10,18 +10,12 @@ set(EMBREE_EXTRA_ARGS
  -DEMBREE_RAY_MASK=ON
  -DEMBREE_FILTER_FUNCTION=ON
  -DEMBREE_BACKFACE_CULLING=OFF
-  -DEMBREE_MAX_ISA=AVX2
  -DEMBREE_TASKING_SYSTEM=TBB
  -DEMBREE_TBB_ROOT=${LIBDIR}/tbb
  -DTBB_ROOT=${LIBDIR}/tbb
-  -DTBB_STATIC_LIB=${TBB_STATIC_LIBRARY}
 )

-if(BLENDER_PLATFORM_ARM)
-  set(EMBREE_EXTRA_ARGS
-    ${EMBREE_EXTRA_ARGS}
-    -DEMBREE_MAX_ISA=NEON)
-else()
+if (NOT BLENDER_PLATFORM_ARM)
  set(EMBREE_EXTRA_ARGS
    ${EMBREE_EXTRA_ARGS}
    -DEMBREE_MAX_ISA=AVX2)
@@ -30,23 +24,10 @@ endif()
 if(TBB_STATIC_LIBRARY)
  set(EMBREE_EXTRA_ARGS
    ${EMBREE_EXTRA_ARGS}
-    -DEMBREE_TBB_LIBRARY_NAME=tbb_static
-    -DEMBREE_TBBMALLOC_LIBRARY_NAME=tbbmalloc_static
+    -DEMBREE_TBB_COMPONENT=tbb_static
  )
 endif()

-if(WIN32)
-  set(EMBREE_BUILD_DIR ${BUILD_MODE}/)
-  if(BUILD_MODE STREQUAL Debug)
-    list(APPEND EMBREE_EXTRA_ARGS
-     -DEMBREE_TBBMALLOC_LIBRARY_NAME=tbbmalloc_debug
-     -DEMBREE_TBB_LIBRARY_NAME=tbb_debug
-    )
-  endif()
-else()
-  set(EMBREE_BUILD_DIR)
-endif()
-
 ExternalProject_Add(external_embree
  URL file://${PACKAGE_DIR}/${EMBREE_FILE}
  DOWNLOAD_DIR ${DOWNLOAD_DIR}
--- a/build_files/build_environment/cmake/versions.cmake
+++ b/build_files/build_environment/cmake/versions.cmake
@@ -410,9 +410,9 @@ set(SQLITE_HASH fb558c49ee21a837713c4f1e7e413309aabdd9c7)
 set(SQLITE_HASH_TYPE SHA1)
 set(SQLITE_FILE sqlite-src-3240000.zip)

-set(EMBREE_VERSION 3.13.3)
+set(EMBREE_VERSION 3.13.4)
 set(EMBREE_URI https://github.com/embree/embree/archive/v${EMBREE_VERSION}.zip)
-set(EMBREE_HASH f62766ba54e48a2f327c3a22596e7133)
+set(EMBREE_HASH 52d0be294d6c88ba7a6c9e046796e7be)
 set(EMBREE_HASH_TYPE MD5)
 set(EMBREE_FILE embree-v${EMBREE_VERSION}.zip)

--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -567,7 +567,7 @@ OPENCOLLADA_FORCE_BUILD=false
 OPENCOLLADA_FORCE_REBUILD=false
 OPENCOLLADA_SKIP=false

-EMBREE_VERSION="3.13.3"
+EMBREE_VERSION="3.13.4"
 EMBREE_VERSION_SHORT="3.13"
 EMBREE_VERSION_MIN="3.13"
 EMBREE_VERSION_MEX="4.0"
--- a/build_files/build_environment/patches/embree.diff
+++ b/build_files/build_environment/patches/embree.diff
@@ -1,30 +1,37 @@
-diff -Naur orig/common/sys/platform.h external_embree/common/sys/platform.h
--- orig/common/sys/platform.h	2020-05-13 23:08:53 -0600
-+++ external_embree/common/sys/platform.h	2020-06-13 17:40:26 -0600
-@@ -84,8 +84,8 @@
- ////////////////////////////////////////////////////////////////////////////////
+diff -Naur org/kernels/rtcore_config.h.in embree-3.13.4/kernels/rtcore_config.h.in
+--- org/kernels/rtcore_config.h.in      2022-06-14 22:13:52 -0600
+++ embree-3.13.4/kernels/rtcore_config.h.in    2022-06-24 15:20:12 -0600
+@@ -14,6 +14,7 @@
+ #cmakedefine01 EMBREE_MIN_WIDTH
+ #define RTC_MIN_WIDTH EMBREE_MIN_WIDTH
+
+#cmakedefine EMBREE_STATIC_LIB
+ #cmakedefine EMBREE_API_NAMESPACE
+
+ #if defined(EMBREE_API_NAMESPACE)
+diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt
+index 7c2f43d..106b1d5 100644
+--- a/kernels/CMakeLists.txt
+++ b/kernels/CMakeLists.txt
+@@ -201,6 +201,12 @@ embree_files(EMBREE_LIBRARY_FILES_AVX512 ${AVX512})
+ #message("AVX2: ${EMBREE_LIBRARY_FILES_AVX2}")
+ #message("AVX512: ${EMBREE_LIBRARY_FILES_AVX512}")
 
- #ifdef __WIN32__
-#define dll_export __declspec(dllexport)
-#define dll_import __declspec(dllimport)
-+#define dll_export 
-+#define dll_import 
- #else
- #define dll_export __attribute__ ((visibility ("default")))
- #define dll_import 
-diff --git orig/common/tasking/CMakeLists.txt external_embree/common/tasking/CMakeLists.txt
--- orig/common/tasking/CMakeLists.txt
-+++ external_embree/common/tasking/CMakeLists.txt
-@@ -27,7 +27,11 @@
-     else()
-       # If not found try getting older TBB via module (FindTBB.cmake)
-       unset(TBB_DIR CACHE)
-      find_package(TBB 4.1 REQUIRED tbb)
-+      if (TBB_STATIC_LIB)
-+        find_package(TBB 4.1 REQUIRED tbb_static)
-+      else()
-+        find_package(TBB 4.1 REQUIRED tbb)
-+      endif()
-       if (TBB_FOUND)
-         TARGET_LINK_LIBRARIES(tasking PUBLIC TBB)
-         TARGET_INCLUDE_DIRECTORIES(tasking PUBLIC "${TBB_INCLUDE_DIRS}")
+# Bundle Neon2x into the main static library.
+IF(EMBREE_ISA_NEON2X AND EMBREE_STATIC_LIB)
+  LIST(APPEND EMBREE_LIBRARY_FILES ${EMBREE_LIBRARY_FILES_AVX2})
+  LIST(REMOVE_DUPLICATES EMBREE_LIBRARY_FILES)
+ENDIF()
+
+ # replaces all .cpp files with a dummy file that includes that .cpp file
+ # this is to work around an ICC name mangling issue related to lambda functions under windows
+ MACRO (CreateISADummyFiles list isa)
+@@ -277,7 +283,7 @@ IF (EMBREE_ISA_AVX  AND EMBREE_LIBRARY_FILES_AVX)
+   ENDIF()
+ ENDIF()
+ 
+-IF (EMBREE_ISA_AVX2 AND EMBREE_LIBRARY_FILES_AVX2)
+IF (EMBREE_ISA_AVX2 AND EMBREE_LIBRARY_FILES_AVX2 AND NOT (EMBREE_ISA_NEON2X AND EMBREE_STATIC_LIB))
+   DISABLE_STACK_PROTECTOR_FOR_INTERSECTORS(${EMBREE_LIBRARY_FILES_AVX2})
+   ADD_LIBRARY(embree_avx2 STATIC ${EMBREE_LIBRARY_FILES_AVX2})
+   TARGET_LINK_LIBRARIES(embree_avx2 PRIVATE tasking)
--- a/build_files/cmake/config/blender_release.cmake
+++ b/build_files/cmake/config/blender_release.cmake
@@ -71,7 +71,6 @@ if(NOT WIN32)
 endif()
 if(WIN32)
  set(WITH_WASAPI               ON  CACHE BOOL "" FORCE)
-  set(WITH_CYCLES_DEVICE_ONEAPI ON  CACHE BOOL "" FORCE)
 endif()
 if(UNIX AND NOT APPLE)
  set(WITH_DOC_MANPAGE         ON  CACHE BOOL "" FORCE)
@@ -92,6 +91,7 @@ if(NOT APPLE)
  set(WITH_CYCLES_CUDA_BINARIES   ON  CACHE BOOL "" FORCE)
  set(WITH_CYCLES_CUBIN_COMPILER  OFF CACHE BOOL "" FORCE)
  set(WITH_CYCLES_HIP_BINARIES    ON  CACHE BOOL "" FORCE)
+  set(WITH_CYCLES_DEVICE_ONEAPI   ON  CACHE BOOL "" FORCE)

  # Disable AoT kernels compilations until buildbot can deliver them in a reasonabel time.
  set(WITH_CYCLES_ONEAPI_BINARIES OFF CACHE BOOL "" FORCE)
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1529,7 +1529,8 @@ def pyrna2sphinx(basepath):
        else:
            fw(".. class:: %s\n\n" % struct_id)

-        fw("   %s\n\n" % struct.description)
+        write_indented_lines("   ", fw, struct.description, False)
+        fw("\n")

        # Properties sorted in alphabetical order.
        sorted_struct_properties = struct.properties[:]
--- a/extern/audaspace/bindings/C/AUD_Special.cpp
+++ b/extern/audaspace/bindings/C/AUD_Special.cpp
@@ -270,7 +270,7 @@ AUD_API int AUD_readSound(AUD_Sound* sound, float* buffer, int length, int sampl
 	return length;
 }

-AUD_API const char* AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate, void(*callback)(float, void*), void* data)
+AUD_API int AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate, void(*callback)(float, void*), void* data, char* error, size_t errorsize)
 {
 	try
 	{
@@ -282,15 +282,20 @@ AUD_API const char* AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned i
 		std::shared_ptr<IWriter> writer = FileWriter::createWriter(filename, convCToDSpec(specs), static_cast<Container>(format), static_cast<Codec>(codec), bitrate);
 		FileWriter::writeReader(reader, writer, length, buffersize, callback, data);

-		return nullptr;
+		return true;
 	}
 	catch(Exception& e)
 	{
-		return e.getMessage().c_str();
+		if(error && errorsize)
+		{
+			std::strncpy(error, e.getMessage().c_str(), errorsize);
+			error[errorsize - 1] = '\0';
+		}
+		return false;
 	}
 }

-AUD_API const char* AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate, void(*callback)(float, void*), void* data)
+AUD_API int AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start, unsigned int length, unsigned int buffersize, const char* filename, AUD_DeviceSpecs specs, AUD_Container format, AUD_Codec codec, unsigned int bitrate, void(*callback)(float, void*), void* data, char* error, size_t errorsize)
 {
 	try
 	{
@@ -328,11 +333,16 @@ AUD_API const char* AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start
 		reader->seek(start);
 		FileWriter::writeReader(reader, writers, length, buffersize, callback, data);

-		return nullptr;
+		return true;
 	}
 	catch(Exception& e)
 	{
-		return e.getMessage().c_str();
+		if(error && errorsize)
+		{
+			std::strncpy(error, e.getMessage().c_str(), errorsize);
+			error[errorsize - 1] = '\0';
+		}
+		return false;
 	}
 }

--- a/extern/audaspace/bindings/C/AUD_Special.h
+++ b/extern/audaspace/bindings/C/AUD_Special.h
@@ -70,13 +70,15 @@ extern AUD_API int AUD_readSound(AUD_Sound* sound, float* buffer, int length, in
 * \param bitrate The bitrate for encoding.
 * \param callback A callback function that is called periodically during mixdown, reporting progress if length > 0. Can be NULL.
 * \param data Pass through parameter that is passed to the callback.
- * \return An error message or NULL in case of success.
+ * \param error String buffer to copy the error message to in case of failure.
+ * \param errorsize The size of the error buffer.
+ * \return Whether or not the operation succeeded.
 */
-extern AUD_API const char* AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned int length,
+extern AUD_API int AUD_mixdown(AUD_Sound* sound, unsigned int start, unsigned int length,
 							   unsigned int buffersize, const char* filename,
 							   AUD_DeviceSpecs specs, AUD_Container format,
 							   AUD_Codec codec, unsigned int bitrate,
-							   void(*callback)(float, void*), void* data);
+							   void(*callback)(float, void*), void* data, char* error, size_t errorsize);

 /**
 * Mixes a sound down into multiple files.
@@ -91,13 +93,15 @@ extern AUD_API const char* AUD_mixdown(AUD_Sound* sound, unsigned int start, uns
 * \param bitrate The bitrate for encoding.
 * \param callback A callback function that is called periodically during mixdown, reporting progress if length > 0. Can be NULL.
 * \param data Pass through parameter that is passed to the callback.
- * \return An error message or NULL in case of success.
+ * \param error String buffer to copy the error message to in case of failure.
+ * \param errorsize The size of the error buffer.
+ * \return Whether or not the operation succeeded.
 */
-extern AUD_API const char* AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start, unsigned int length,
+extern AUD_API int AUD_mixdown_per_channel(AUD_Sound* sound, unsigned int start, unsigned int length,
 										   unsigned int buffersize, const char* filename,
 										   AUD_DeviceSpecs specs, AUD_Container format,
 										   AUD_Codec codec, unsigned int bitrate,
-										   void(*callback)(float, void*), void* data);
+										   void(*callback)(float, void*), void* data, char* error, size_t errorsize);

 /**
 * Opens a read device and prepares it for mixdown of the sound scene.
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -128,10 +128,6 @@ if(WITH_OPENIMAGEDENOISE)
  )
 endif()

-if(WITH_EXPERIMENTAL_FEATURES)
-  add_definitions(-DWITH_NEW_CURVES_TYPE)
-endif()
-
 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

 add_dependencies(bf_intern_cycles bf_rna)
--- a/intern/cycles/blender/addon/presets.py
+++ b/intern/cycles/blender/addon/presets.py
@@ -84,10 +84,36 @@ class AddPresetViewportSampling(AddPresetBase, Operator):
    preset_subdir = "cycles/viewport_sampling"


+class AddPresetPerformance(AddPresetBase, Operator):
+    '''Add an Performance Preset'''
+    bl_idname = "render.cycles_performance_preset_add"
+    bl_label = "Add Performance Preset"
+    preset_menu = "CYCLES_PT_performance_presets"
+
+    preset_defines = [
+        "render = bpy.context.scene.render"
+        "cycles = bpy.context.scene.cycles"
+    ]
+
+    preset_values = [
+        "render.threads_mode",
+        "render.use_persistent_data",
+        "cycles.debug_use_spatial_splits",
+        "cycles.debug_use_compact_bvh",
+        "cycles.debug_use_hair_bvh",
+        "cycles.debug_bvh_time_steps",
+        "cycles.use_auto_tile",
+        "cycles.tile_size",
+    ]
+
+    preset_subdir = "cycles/performance"
+
+
 classes = (
    AddPresetIntegrator,
    AddPresetSampling,
    AddPresetViewportSampling,
+    AddPresetPerformance,
 )


--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -693,7 +693,7 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
    debug_use_compact_bvh: BoolProperty(
        name="Use Compact BVH",
        description="Use compact BVH structure (uses less ram but renders slower)",
-        default=True,
+        default=False,
    )
    debug_bvh_time_steps: IntProperty(
        name="BVH Time Steps",
@@ -1560,7 +1560,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
                if sys.platform.startswith("win"):
                    col.label(text="and Windows driver version 101.1660 or newer", icon='BLANK1')
                elif sys.platform.startswith("linux"):
-                    col.label(text="and Linux driver version xx.xx.20066 or newer", icon='BLANK1')
+                    col.label(text="and Linux driver version xx.xx.23570 or newer", icon='BLANK1')
            elif device_type == 'METAL':
                col.label(text="Requires Apple Silicon with macOS 12.2 or newer", icon='BLANK1')
                col.label(text="or AMD with macOS 12.3 or newer", icon='BLANK1')
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -43,6 +43,12 @@ class CYCLES_PT_integrator_presets(CyclesPresetPanel):
    preset_add_operator = "render.cycles_integrator_preset_add"


+class CYCLES_PT_performance_presets(CyclesPresetPanel):
+    bl_label = "Performance Presets"
+    preset_subdir = "cycles/performance"
+    preset_add_operator = "render.cycles_performance_preset_add"
+
+
 class CyclesButtonsPanel:
    bl_space_type = "PROPERTIES"
    bl_region_type = "WINDOW"
@@ -624,6 +630,9 @@ class CYCLES_RENDER_PT_performance(CyclesButtonsPanel, Panel):
    bl_label = "Performance"
    bl_options = {'DEFAULT_CLOSED'}

+    def draw_header_preset(self, context):
+        CYCLES_PT_performance_presets.draw_panel_header(self.layout)
+
    def draw(self, context):
        pass

@@ -943,6 +952,8 @@ class CYCLES_CAMERA_PT_dof(CyclesButtonsPanel, Panel):

        col = split.column()
        col.prop(dof, "focus_object", text="Focus Object")
+        if dof.focus_object and dof.focus_object.type == 'ARMATURE':
+            col.prop_search(dof, "focus_subtarget", dof.focus_object.data, "bones", text="Focus Bone")

        sub = col.row()
        sub.active = dof.focus_object is None
@@ -1202,7 +1213,7 @@ class CYCLES_OBJECT_PT_lightgroup(CyclesButtonsPanel, Panel):
        sub.prop_search(ob, "lightgroup", view_layer, "lightgroups", text="Light Group", results_are_suggestions=True)

        sub = row.column(align=True)
-        sub.active = bool(ob.lightgroup) and not any(lg.name == ob.lightgroup for lg in view_layer.lightgroups)
+        sub.enabled = bool(ob.lightgroup) and not any(lg.name == ob.lightgroup for lg in view_layer.lightgroups)
        sub.operator("scene.view_layer_add_lightgroup", icon='ADD', text="").name = ob.lightgroup


@@ -1640,7 +1651,7 @@ class CYCLES_WORLD_PT_settings_light_group(CyclesButtonsPanel, Panel):
        )

        sub = row.column(align=True)
-        sub.active = bool(world.lightgroup) and not any(lg.name == world.lightgroup for lg in view_layer.lightgroups)
+        sub.enabled = bool(world.lightgroup) and not any(lg.name == world.lightgroup for lg in view_layer.lightgroups)
        sub.operator("scene.view_layer_add_lightgroup", icon='ADD', text="").name = world.lightgroup


@@ -2269,6 +2280,7 @@ classes = (
    CYCLES_PT_sampling_presets,
    CYCLES_PT_viewport_sampling_presets,
    CYCLES_PT_integrator_presets,
+    CYCLES_PT_performance_presets,
    CYCLES_RENDER_PT_sampling,
    CYCLES_RENDER_PT_sampling_viewport,
    CYCLES_RENDER_PT_sampling_viewport_denoise,
--- a/intern/cycles/blender/camera.cpp
+++ b/intern/cycles/blender/camera.cpp
@@ -143,11 +143,20 @@ static float blender_camera_focal_distance(BL::RenderEngine &b_engine,
  if (!b_dof_object)
    return b_camera.dof().focus_distance();

+  Transform dofmat = get_transform(b_dof_object.matrix_world());
+
+  string focus_subtarget = b_camera.dof().focus_subtarget();
+  if (b_dof_object.pose() && !focus_subtarget.empty()) {
+    BL::PoseBone b_bone = b_dof_object.pose().bones[focus_subtarget];
+    if (b_bone) {
+      dofmat = dofmat * get_transform(b_bone.matrix());
+    }
+  }
+
  /* for dof object, return distance along camera Z direction */
  BL::Array<float, 16> b_ob_matrix;
  b_engine.camera_model_matrix(b_ob, bcam->use_spherical_stereo, b_ob_matrix);
  Transform obmat = transform_clear_scale(get_transform(b_ob_matrix));
-  Transform dofmat = get_transform(b_dof_object.matrix_world());
  float3 view_dir = normalize(transform_get_column(&obmat, 2));
  float3 dof_dir = transform_get_column(&obmat, 3) - transform_get_column(&dofmat, 3);
  return fabsf(dot(view_dir, dof_dir));
--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -613,8 +613,6 @@ void BlenderSync::sync_particle_hair(
  }
 }

-#ifdef WITH_NEW_CURVES_TYPE
-
 static std::optional<BL::FloatAttribute> find_curves_radius_attribute(BL::Curves b_curves)
 {
  for (BL::Attribute &b_attribute : b_curves.attributes) {
@@ -632,6 +630,25 @@ static std::optional<BL::FloatAttribute> find_curves_radius_attribute(BL::Curves
  return std::nullopt;
 }

+static BL::FloatVectorAttribute find_curves_position_attribute(BL::Curves b_curves)
+{
+  for (BL::Attribute &b_attribute : b_curves.attributes) {
+    if (b_attribute.name() != "position") {
+      continue;
+    }
+    if (b_attribute.domain() != BL::Attribute::domain_POINT) {
+      continue;
+    }
+    if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT_VECTOR) {
+      continue;
+    }
+    return BL::FloatVectorAttribute{b_attribute};
+  }
+  /* The position attribute must exist. */
+  assert(false);
+  return BL::FloatVectorAttribute{b_curves.attributes[0]};
+}
+
 template<typename TypeInCycles, typename GetValueAtIndex>
 static void fill_generic_attribute(BL::Curves &b_curves,
                                   TypeInCycles *data,
@@ -795,16 +812,16 @@ static void attr_create_generic(Scene *scene,
  }
 }

-static float4 hair_point_as_float4(BL::Curves b_curves,
+static float4 hair_point_as_float4(BL::FloatVectorAttribute b_attr_position,
                                   std::optional<BL::FloatAttribute> b_attr_radius,
                                   const int index)
 {
-  float4 mP = float3_to_float4(get_float3(b_curves.position_data[index].vector()));
+  float4 mP = float3_to_float4(get_float3(b_attr_position.data[index].vector()));
  mP.w = b_attr_radius ? b_attr_radius->data[index].value() : 0.0f;
  return mP;
 }

-static float4 interpolate_hair_points(BL::Curves b_curves,
+static float4 interpolate_hair_points(BL::FloatVectorAttribute b_attr_position,
                                      std::optional<BL::FloatAttribute> b_attr_radius,
                                      const int first_point_index,
                                      const int num_points,
@@ -814,8 +831,8 @@ static float4 interpolate_hair_points(BL::Curves b_curves,
  const int point_a = clamp((int)curve_t, 0, num_points - 1);
  const int point_b = min(point_a + 1, num_points - 1);
  const float t = curve_t - (float)point_a;
-  return lerp(hair_point_as_float4(b_curves, b_attr_radius, first_point_index + point_a),
-              hair_point_as_float4(b_curves, b_attr_radius, first_point_index + point_b),
+  return lerp(hair_point_as_float4(b_attr_position, b_attr_radius, first_point_index + point_a),
+              hair_point_as_float4(b_attr_position, b_attr_radius, first_point_index + point_b),
              t);
 }

@@ -848,6 +865,7 @@ static void export_hair_curves(Scene *scene,

  hair->reserve_curves(num_curves, num_keys);

+  BL::FloatVectorAttribute b_attr_position = find_curves_position_attribute(b_curves);
  std::optional<BL::FloatAttribute> b_attr_radius = find_curves_radius_attribute(b_curves);

  /* Export curves and points. */
@@ -866,9 +884,9 @@ static void export_hair_curves(Scene *scene,

    /* Position and radius. */
    for (int i = 0; i < num_points; i++) {
-      const float3 co = get_float3(b_curves.position_data[first_point_index + i].vector());
+      const float3 co = get_float3(b_attr_position.data[first_point_index + i].vector());
      const float radius = b_attr_radius ? b_attr_radius->data[first_point_index + i].value() :
-                                           0.0f;
+                                           0.005f;
      hair->add_curve_key(co, radius);

      if (attr_intercept) {
@@ -923,6 +941,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motio
  int num_motion_keys = 0;
  int curve_index = 0;

+  BL::FloatVectorAttribute b_attr_position = find_curves_position_attribute(b_curves);
  std::optional<BL::FloatAttribute> b_attr_radius = find_curves_radius_attribute(b_curves);

  for (int i = 0; i < num_curves; i++) {
@@ -938,7 +957,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motio
        int point_index = first_point_index + i;

        if (point_index < num_keys) {
-          mP[num_motion_keys] = hair_point_as_float4(b_curves, b_attr_radius, point_index);
+          mP[num_motion_keys] = hair_point_as_float4(b_attr_position, b_attr_radius, point_index);
          num_motion_keys++;

          if (!have_motion) {
@@ -958,7 +977,7 @@ static void export_hair_curves_motion(Hair *hair, BL::Curves b_curves, int motio
      for (int i = 0; i < curve.num_keys; i++) {
        const float step = i * step_size;
        mP[num_motion_keys] = interpolate_hair_points(
-            b_curves, b_attr_radius, first_point_index, num_points, step);
+            b_attr_position, b_attr_radius, first_point_index, num_points, step);
        num_motion_keys++;
      }
      have_motion = true;
@@ -990,15 +1009,6 @@ void BlenderSync::sync_hair(Hair *hair, BObjectInfo &b_ob_info, bool motion, int
    export_hair_curves(scene, hair, b_curves, need_motion, motion_scale);
  }
 }
-#else
-void BlenderSync::sync_hair(Hair *hair, BObjectInfo &b_ob_info, bool motion, int motion_step)
-{
-  (void)hair;
-  (void)b_ob_info;
-  (void)motion;
-  (void)motion_step;
-}
-#endif

 void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, Hair *hair)
 {
@@ -1010,14 +1020,11 @@ void BlenderSync::sync_hair(BL::Depsgraph b_depsgraph, BObjectInfo &b_ob_info, H
  new_hair.set_used_shaders(used_shaders);

  if (view_layer.use_hair) {
-#ifdef WITH_NEW_CURVES_TYPE
    if (b_ob_info.object_data.is_a(&RNA_Curves)) {
      /* Hair object. */
      sync_hair(&new_hair, b_ob_info, false);
    }
-    else
-#endif
-    {
+    else {
      /* Particle hair. */
      bool need_undeformed = new_hair.need_attribute(scene, ATTR_STD_GENERATED);
      BL::Mesh b_mesh = object_to_mesh(
@@ -1064,15 +1071,12 @@ void BlenderSync::sync_hair_motion(BL::Depsgraph b_depsgraph,

  /* Export deformed coordinates. */
  if (ccl::BKE_object_is_deform_modified(b_ob_info, b_scene, preview)) {
-#ifdef WITH_NEW_CURVES_TYPE
    if (b_ob_info.object_data.is_a(&RNA_Curves)) {
      /* Hair object. */
      sync_hair(hair, b_ob_info, true, motion_step);
      return;
    }
-    else
-#endif
-    {
+    else {
      /* Particle hair. */
      BL::Mesh b_mesh = object_to_mesh(
          b_data, b_ob_info, b_depsgraph, false, Mesh::SUBDIVISION_NONE);
--- a/intern/cycles/blender/geometry.cpp
+++ b/intern/cycles/blender/geometry.cpp
@@ -18,11 +18,7 @@ CCL_NAMESPACE_BEGIN

 static Geometry::Type determine_geom_type(BObjectInfo &b_ob_info, bool use_particle_hair)
 {
-#ifdef WITH_NEW_CURVES_TYPE
  if (b_ob_info.object_data.is_a(&RNA_Curves) || use_particle_hair) {
-#else
-  if (use_particle_hair) {
-#endif
    return Geometry::HAIR;
  }

@@ -217,11 +213,7 @@ void BlenderSync::sync_geometry_motion(BL::Depsgraph &b_depsgraph,
    if (progress.get_cancel())
      return;

-#ifdef WITH_NEW_CURVES_TYPE
    if (b_ob_info.object_data.is_a(&RNA_Curves) || use_particle_hair) {
-#else
-    if (use_particle_hair) {
-#endif
      Hair *hair = static_cast<Hair *>(geom);
      sync_hair_motion(b_depsgraph, b_ob_info, hair, motion_step);
    }
--- a/intern/cycles/blender/pointcloud.cpp
+++ b/intern/cycles/blender/pointcloud.cpp
@@ -1,8 +1,10 @@
 /* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

-#include "scene/pointcloud.h"
+#include <optional>
+
 #include "scene/attribute.h"
+#include "scene/pointcloud.h"
 #include "scene/scene.h"

 #include "blender/sync.h"
@@ -138,6 +140,36 @@ static void copy_attributes(PointCloud *pointcloud,
  }
 }

+static std::optional<BL::FloatAttribute> find_radius_attribute(BL::PointCloud b_pointcloud)
+{
+  for (BL::Attribute &b_attribute : b_pointcloud.attributes) {
+    if (b_attribute.name() != "radius") {
+      continue;
+    }
+    if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT) {
+      continue;
+    }
+    return BL::FloatAttribute{b_attribute};
+  }
+  return std::nullopt;
+}
+
+static BL::FloatVectorAttribute find_position_attribute(BL::PointCloud b_pointcloud)
+{
+  for (BL::Attribute &b_attribute : b_pointcloud.attributes) {
+    if (b_attribute.name() != "position") {
+      continue;
+    }
+    if (b_attribute.data_type() != BL::Attribute::data_type_FLOAT_VECTOR) {
+      continue;
+    }
+    return BL::FloatVectorAttribute{b_attribute};
+  }
+  /* The position attribute must exist. */
+  assert(false);
+  return BL::FloatVectorAttribute{b_pointcloud.attributes[0]};
+}
+
 static void export_pointcloud(Scene *scene,
                              PointCloud *pointcloud,
                              BL::PointCloud b_pointcloud,
@@ -156,18 +188,18 @@ static void export_pointcloud(Scene *scene,
  const int num_points = b_pointcloud.points.length();
  pointcloud->reserve(num_points);

+  BL::FloatVectorAttribute b_attr_position = find_position_attribute(b_pointcloud);
+  std::optional<BL::FloatAttribute> b_attr_radius = find_radius_attribute(b_pointcloud);
+
  /* Export points. */
-  BL::PointCloud::points_iterator b_point_iter;
-  for (b_pointcloud.points.begin(b_point_iter); b_point_iter != b_pointcloud.points.end();
-       ++b_point_iter) {
-    BL::Point b_point = *b_point_iter;
-    const float3 co = get_float3(b_point.co());
-    const float radius = b_point.radius();
+  for (int i = 0; i < num_points; i++) {
+    const float3 co = get_float3(b_attr_position.data[i].vector());
+    const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f;
    pointcloud->add_point(co, radius);

    /* Random number per point. */
    if (attr_random != NULL) {
-      attr_random->add(hash_uint2_to_float(b_point.index(), 0));
+      attr_random->add(hash_uint2_to_float(i, 0));
    }
  }

@@ -195,14 +227,15 @@ static void export_pointcloud_motion(PointCloud *pointcloud,
  int num_motion_points = 0;
  const array<float3> &pointcloud_points = pointcloud->get_points();

-  BL::PointCloud::points_iterator b_point_iter;
-  for (b_pointcloud.points.begin(b_point_iter); b_point_iter != b_pointcloud.points.end();
-       ++b_point_iter) {
-    BL::Point b_point = *b_point_iter;
+  BL::FloatVectorAttribute b_attr_position = find_position_attribute(b_pointcloud);
+  std::optional<BL::FloatAttribute> b_attr_radius = find_radius_attribute(b_pointcloud);

+  for (int i = 0; i < num_points; i++) {
    if (num_motion_points < num_points) {
-      float3 P = get_float3(b_point.co());
-      P.w = b_point.radius();
+      const float3 co = get_float3(b_attr_position.data[i].vector());
+      const float radius = b_attr_radius ? b_attr_radius->data[i].value() : 0.0f;
+      float3 P = co;
+      P.w = radius;
      mP[num_motion_points] = P;
      have_motion = have_motion || (P != pointcloud_points[num_motion_points]);
      num_motion_points++;
--- a/intern/cycles/bvh/params.h
+++ b/intern/cycles/bvh/params.h
@@ -129,7 +129,7 @@ class BVHParams {

    top_level = false;
    bvh_layout = BVH_LAYOUT_BVH2;
-    use_compact_structure = true;
+    use_compact_structure = false;
    use_unaligned_nodes = false;

    num_motion_curve_steps = 0;
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -197,7 +197,7 @@ void CPUDevice::const_copy_to(const char *name, void *host, size_t size)

    // Update scene handle (since it is different for each device on multi devices)
    KernelData *const data = (KernelData *)host;
-    data->bvh.scene = embree_scene;
+    data->device_bvh = embree_scene;
  }
 #endif
  kernel_const_copy(&kernel_globals, name, host, size);
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -29,6 +29,7 @@ class DeviceQueue;
 class Progress;
 class CPUKernels;
 class CPUKernelThreadGlobals;
+class Scene;

 /* Device Types */

@@ -186,6 +187,11 @@ class Device {
    return 0;
  }

+  /* Called after kernel texture setup, and prior to integrator state setup. */
+  virtual void optimize_for_scene(Scene * /*scene*/)
+  {
+  }
+
  virtual bool is_resident(device_ptr /*key*/, Device *sub_device)
  {
    /* Memory is always resident if this is not a multi device, regardless of whether the pointer
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -75,7 +75,8 @@ class MetalDevice : public Device {
  std::vector<id<MTLTexture>> texture_slot_map;

  bool use_metalrt = false;
-  bool use_function_specialisation = false;
+  MetalPipelineType kernel_specialization_level = PSO_GENERIC;
+  std::atomic_bool async_compile_and_load = false;

  virtual BVHLayoutMask get_bvh_layout_mask() const override;

@@ -91,9 +92,7 @@ class MetalDevice : public Device {

  bool use_adaptive_compilation();

-  string get_source(const uint kernel_features);
-
-  string compile_kernel(const uint kernel_features, const char *name);
+  void make_source(MetalPipelineType pso_type, const uint kernel_features);

  virtual bool load_kernels(const uint kernel_features) override;

@@ -111,7 +110,9 @@ class MetalDevice : public Device {

  virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;

-  id<MTLLibrary> compile(string const &source);
+  virtual void optimize_for_scene(Scene *scene) override;
+
+  bool compile_and_load(MetalPipelineType pso_type);

  /* ------------------------------------------------------------------ */
  /* low-level memory management */
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -6,6 +6,8 @@
 #  include "device/metal/device_impl.h"
 #  include "device/metal/device.h"

+#  include "scene/scene.h"
+
 #  include "util/debug.h"
 #  include "util/md5.h"
 #  include "util/path.h"
@@ -78,6 +80,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    case METAL_GPU_APPLE: {
      max_threads_per_threadgroup = 512;
      use_metalrt = info.use_metalrt;
+
+      /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
+       * quickly. */
+      kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
      break;
    }
  }
@@ -90,6 +96,13 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
    capture_enabled = true;
  }

+  if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
+    kernel_specialization_level = (MetalPipelineType)atoi(envstr);
+  }
+  metal_printf("kernel_specialization_level = %s\n",
+               kernel_type_as_string(
+                   (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
+
  MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
  arg_desc_params.dataType = MTLDataTypePointer;
  arg_desc_params.access = MTLArgumentAccessReadOnly;
@@ -209,61 +222,86 @@ bool MetalDevice::use_adaptive_compilation()
  return DebugFlags().metal.adaptive_compile;
 }

-string MetalDevice::get_source(const uint kernel_features)
+void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
 {
-  string build_options;
-
+  string global_defines;
  if (use_adaptive_compilation()) {
-    build_options += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
+    global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
  }

  if (use_metalrt) {
-    build_options += "-D__METALRT__ ";
+    global_defines += "#define __METALRT__\n";
    if (motion_blur) {
-      build_options += "-D__METALRT_MOTION__ ";
+      global_defines += "#define __METALRT_MOTION__\n";
    }
  }

 #  ifdef WITH_CYCLES_DEBUG
-  build_options += "-D__KERNEL_DEBUG__ ";
+  global_defines += "#define __KERNEL_DEBUG__\n";
 #  endif

  switch (device_vendor) {
    default:
      break;
    case METAL_GPU_INTEL:
-      build_options += "-D__KERNEL_METAL_INTEL__ ";
+      global_defines += "#define __KERNEL_METAL_INTEL__\n";
      break;
    case METAL_GPU_AMD:
-      build_options += "-D__KERNEL_METAL_AMD__ ";
+      global_defines += "#define __KERNEL_METAL_AMD__\n";
      break;
    case METAL_GPU_APPLE:
-      build_options += "-D__KERNEL_METAL_APPLE__ ";
+      global_defines += "#define __KERNEL_METAL_APPLE__\n";
      break;
  }

-  /* reformat -D defines list into compilable form */
-  vector<string> components;
-  string_replace(build_options, "-D", "");
-  string_split(components, build_options, " ");
-
-  string globalDefines;
-  for (const string &component : components) {
-    vector<string> assignments;
-    string_split(assignments, component, "=");
-    if (assignments.size() == 2)
-      globalDefines += string_printf(
-          "#define %s %s\n", assignments[0].c_str(), assignments[1].c_str());
-    else
-      globalDefines += string_printf("#define %s\n", assignments[0].c_str());
-  }
-
-  string source = globalDefines + "\n#include \"kernel/device/metal/kernel.metal\"\n";
+  string &source = this->source[pso_type];
+  source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
  source = path_source_replace_includes(source, path_get("source"));

-  metal_printf("Global defines:\n%s\n", globalDefines.c_str());
+  /* Perform any required specialization on the source.
+   * With Metal function constants we can generate a single variant of the kernel source which can
+   * be repeatedly respecialized.
+   */
+  string baked_constants;

-  return source;
+  /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
+   * the same character length. Build a string of all active constant values which is then hashed
+   * in order to identify the PSO.
+   */
+  if (pso_type != PSO_GENERIC) {
+    const double starttime = time_dt();
+
+#  define KERNEL_STRUCT_BEGIN(name, parent) \
+    string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
+
+    /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
+#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+    baked_constants += string(#parent "." #name "=") + \
+                       to_string(_type(launch_params.data.parent.name)) + "\n";
+
+#  include "kernel/data_template.h"
+
+    /* Opt in to all of available specializations. This can be made more granular for the
+     * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
+     * but the overhead should be negligible as these are very quick to (re)build and aren't
+     * serialized to disk via MTLBinaryArchives.
+     */
+    global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
+
+    metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
+  }
+
+  source = global_defines + source;
+  metal_printf("================\n%s================\n\%s================\n",
+               global_defines.c_str(),
+               baked_constants.c_str());
+
+  /* Generate an MD5 from the source and include any baked constants. This is used when caching
+   * PSOs. */
+  MD5Hash md5;
+  md5.append(baked_constants);
+  md5.append(source);
+  source_md5[pso_type] = md5.get_hex();
 }

 bool MetalDevice::load_kernels(const uint _kernel_features)
@@ -279,28 +317,22 @@ bool MetalDevice::load_kernels(const uint _kernel_features)
   * active, but may still need to be rendered without motion blur if that isn't active as well. */
  motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;

-  source[PSO_GENERIC] = get_source(kernel_features);
-
-  const double starttime = time_dt();
-
-  mtlLibrary[PSO_GENERIC] = compile(source[PSO_GENERIC]);
-
-  metal_printf("Front-end compilation finished in %.1f seconds (generic)\n",
-               time_dt() - starttime);
-
-  MD5Hash md5;
-  md5.append(source[PSO_GENERIC]);
-  source_md5[PSO_GENERIC] = md5.get_hex();
-
-  bool result = MetalDeviceKernels::load(this, false);
+  bool result = compile_and_load(PSO_GENERIC);

  reserve_local_memory(kernel_features);
-
  return result;
 }

-id<MTLLibrary> MetalDevice::compile(string const &source)
+bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
 {
+  make_source(pso_type, kernel_features);
+
+  if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
+    /* We already have a full set of matching pipelines which are cached or queued. */
+    metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
+    return true;
+  }
+
  MTLCompileOptions *options = [[MTLCompileOptions alloc] init];

  options.fastMathEnabled = YES;
@@ -308,19 +340,30 @@ id<MTLLibrary> MetalDevice::compile(string const &source)
    options.languageVersion = MTLLanguageVersion2_4;
  }

-  NSError *error = NULL;
-  id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
-                                                      options:options
-                                                        error:&error];
+  if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
+    path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
+                    source[pso_type]);
+  }

-  if (!mtlLibrary) {
+  const double starttime = time_dt();
+
+  NSError *error = NULL;
+  mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
+                                                 options:options
+                                                   error:&error];
+
+  if (!mtlLibrary[pso_type]) {
    NSString *err = [error localizedDescription];
    set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
  }

+  metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
+               time_dt() - starttime,
+               kernel_type_as_string(pso_type));
+
  [options release];

-  return mtlLibrary;
+  return MetalDeviceKernels::load(this, pso_type);
 }

 void MetalDevice::reserve_local_memory(const uint kernel_features)
@@ -627,6 +670,58 @@ device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, siz
  return 0;
 }

+void MetalDevice::optimize_for_scene(Scene *scene)
+{
+  MetalPipelineType specialization_level = kernel_specialization_level;
+
+  if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
+    return;
+  }
+
+  /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
+   * synchronously. */
+  compile_and_load(PSO_SPECIALIZED_INTERSECT);
+
+  if (specialization_level < PSO_SPECIALIZED_SHADE) {
+    return;
+  }
+  if (!scene->params.background) {
+    /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
+     * build. */
+    return;
+  }
+
+  /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
+   * only if there isn't an existing load in flight.
+   */
+  auto specialize_shade_fn = ^() {
+    compile_and_load(PSO_SPECIALIZED_SHADE);
+    async_compile_and_load = false;
+  };
+
+  bool async_specialize_shade = true;
+
+  /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */
+  if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
+    async_specialize_shade = false;
+  }
+
+  if (async_specialize_shade) {
+    if (!async_compile_and_load) {
+      async_compile_and_load = true;
+      dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
+                     specialize_shade_fn);
+    }
+    else {
+      metal_printf(
+          "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
+    }
+  }
+  else {
+    specialize_shade_fn();
+  }
+}
+
 void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
 {
  if (strcmp(name, "data") == 0) {
@@ -652,7 +747,7 @@ void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
  /* Update data storage pointers in launch parameters. */
  if (strcmp(name, "integrator_state") == 0) {
    /* IntegratorStateGPU is contiguous pointers */
-    const size_t pointer_block_size = sizeof(IntegratorStateGPU);
+    const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
    update_launch_pointers(
        offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
  }
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -31,7 +31,7 @@ enum {
 enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };

 /* Pipeline State Object types */
-enum {
+enum MetalPipelineType {
  /* A kernel that can be used with all scenes, supporting all features.
   * It is slow to compile, but only needs to be compiled once and is then
   * cached for future render sessions. This allows a render to get underway
@@ -39,28 +39,33 @@ enum {
   */
  PSO_GENERIC,

-  /* A kernel that is relatively quick to compile, but is specialized for the
-   * scene being rendered. It only contains the functionality and even baked in
-   * constants for values that means it needs to be recompiled whenever a
-   * dependent setting is changed. The render performance of this kernel is
-   * significantly faster though, and justifies the extra compile time.
+  /* A intersection kernel that is very quick to specialize and results in faster intersection
+   * kernel performance. It uses Metal function constants to replace several KernelData variables
+   * with fixed constants.
   */
-  /* METAL_WIP: This isn't used and will require more changes to enable. */
-  PSO_SPECIALISED,
+  PSO_SPECIALIZED_INTERSECT,
+
+  /* A shading kernel that is slow to specialize, but results in faster shading kernel performance
+   * rendered. It uses Metal function constants to replace several KernelData variables with fixed
+   * constants and short-circuit all unused SVM node case handlers.
+   */
+  PSO_SPECIALIZED_SHADE,

  PSO_NUM
 };

-const char *kernel_type_as_string(int kernel_type);
+const char *kernel_type_as_string(MetalPipelineType pso_type);

 struct MetalKernelPipeline {

  void compile();

  id<MTLLibrary> mtlLibrary = nil;
-  bool scene_specialized;
+  MetalPipelineType pso_type;
  string source_md5;
+  size_t usage_count = 0;

+  KernelData kernel_data_;
  bool use_metalrt;
  bool metalrt_hair;
  bool metalrt_hair_thick;
@@ -75,6 +80,8 @@ struct MetalKernelPipeline {
  id<MTLComputePipelineState> pipeline = nil;
  int num_threads_per_block = 0;

+  bool should_use_binary_archive() const;
+
  string error_str;

  API_AVAILABLE(macos(11.0))
@@ -85,7 +92,8 @@ struct MetalKernelPipeline {
 /* Cache of Metal kernels for each DeviceKernel. */
 namespace MetalDeviceKernels {

-bool load(MetalDevice *device, bool scene_specialized);
+bool should_load_kernels(MetalDevice *device, MetalPipelineType pso_type);
+bool load(MetalDevice *device, MetalPipelineType pso_type);
 const MetalKernelPipeline *get_best_pipeline(const MetalDevice *device, DeviceKernel kernel);

 } /* namespace MetalDeviceKernels */
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -5,6 +5,7 @@

 #  include "device/metal/kernel.h"
 #  include "device/metal/device_impl.h"
+#  include "kernel/device/metal/function_constants.h"
 #  include "util/md5.h"
 #  include "util/path.h"
 #  include "util/tbb.h"
@@ -16,13 +17,15 @@ CCL_NAMESPACE_BEGIN
 /* limit to 2 MTLCompiler instances */
 int max_mtlcompiler_threads = 2;

-const char *kernel_type_as_string(int kernel_type)
+const char *kernel_type_as_string(MetalPipelineType pso_type)
 {
-  switch (kernel_type) {
+  switch (pso_type) {
    case PSO_GENERIC:
      return "PSO_GENERIC";
-    case PSO_SPECIALISED:
-      return "PSO_SPECIALISED";
+    case PSO_SPECIALIZED_INTERSECT:
+      return "PSO_SPECIALIZED_INTERSECT";
+    case PSO_SPECIALIZED_SHADE:
+      return "PSO_SPECIALIZED_SHADE";
    default:
      assert(0);
  }
@@ -50,7 +53,11 @@ struct ShaderCache {

  /* Non-blocking request for a kernel, optionally specialized to the scene being rendered by
   * device. */
-  void load_kernel(DeviceKernel kernel, MetalDevice *device, bool scene_specialized);
+  void load_kernel(DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
+
+  bool should_load_kernel(DeviceKernel device_kernel,
+                          MetalDevice *device,
+                          MetalPipelineType pso_type);

  void wait_for_all();

@@ -139,9 +146,53 @@ void ShaderCache::compile_thread_func(int thread_index)
  }
 }

+bool ShaderCache::should_load_kernel(DeviceKernel device_kernel,
+                                     MetalDevice *device,
+                                     MetalPipelineType pso_type)
+{
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+    /* Skip megakernel. */
+    return false;
+  }
+
+  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
+    if ((device->kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) {
+      /* Skip shade_surface_raytrace kernel if the scene doesn't require it. */
+      return false;
+    }
+  }
+
+  if (pso_type != PSO_GENERIC) {
+    /* Only specialize kernels where it can make an impact. */
+    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+      return false;
+    }
+
+    /* Only specialize shading / intersection kernels as requested. */
+    bool is_shade_kernel = (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
+    if (is_shade_pso != is_shade_kernel) {
+      return false;
+    }
+  }
+
+  {
+    /* check whether the kernel has already been requested / cached */
+    thread_scoped_lock lock(cache_mutex);
+    for (auto &pipeline : pipelines[device_kernel]) {
+      if (pipeline->source_md5 == device->source_md5[pso_type]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 void ShaderCache::load_kernel(DeviceKernel device_kernel,
                              MetalDevice *device,
-                              bool scene_specialized)
+                              MetalPipelineType pso_type)
 {
  {
    /* create compiler threads on first run */
@@ -154,52 +205,21 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,
    }
  }

-  if (device_kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-    /* skip megakernel */
+  if (!should_load_kernel(device_kernel, device, pso_type)) {
    return;
  }

-  if (scene_specialized) {
-    /* Only specialize kernels where it can make an impact. */
-    if (device_kernel < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
-        device_kernel > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-      return;
-    }
-  }
-
-  {
-    /* check whether the kernel has already been requested / cached */
-    thread_scoped_lock lock(cache_mutex);
-    for (auto &pipeline : pipelines[device_kernel]) {
-      if (scene_specialized) {
-        if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
-          /* we already requested a pipeline that is specialized for this kernel data */
-          metal_printf("Specialized kernel already requested (%s)\n",
-                       device_kernel_as_string(device_kernel));
-          return;
-        }
-      }
-      else {
-        if (pipeline->source_md5 == device->source_md5[PSO_GENERIC]) {
-          /* we already requested a generic pipeline for this kernel */
-          metal_printf("Generic kernel already requested (%s)\n",
-                       device_kernel_as_string(device_kernel));
-          return;
-        }
-      }
-    }
-  }
-
  incomplete_requests++;

  PipelineRequest request;
  request.pipeline = new MetalKernelPipeline;
-  request.pipeline->scene_specialized = scene_specialized;
+  memcpy(&request.pipeline->kernel_data_,
+         &device->launch_params.data,
+         sizeof(request.pipeline->kernel_data_));
+  request.pipeline->pso_type = pso_type;
  request.pipeline->mtlDevice = mtlDevice;
-  request.pipeline->source_md5 =
-      device->source_md5[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
-  request.pipeline->mtlLibrary =
-      device->mtlLibrary[scene_specialized ? PSO_SPECIALISED : PSO_GENERIC];
+  request.pipeline->source_md5 = device->source_md5[pso_type];
+  request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
  request.pipeline->device_kernel = device_kernel;
  request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;

@@ -214,7 +234,24 @@ void ShaderCache::load_kernel(DeviceKernel device_kernel,

  {
    thread_scoped_lock lock(cache_mutex);
-    pipelines[device_kernel].push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
+    auto &collection = pipelines[device_kernel];
+
+    /* Cache up to 3 kernel variants with the same pso_type, purging oldest first. */
+    int max_entries_of_same_pso_type = 3;
+    for (int i = (int)collection.size() - 1; i >= 0; i--) {
+      if (collection[i]->pso_type == pso_type) {
+        max_entries_of_same_pso_type -= 1;
+        if (max_entries_of_same_pso_type == 0) {
+          metal_printf("Purging oldest %s:%s kernel from ShaderCache\n",
+                       kernel_type_as_string(pso_type),
+                       device_kernel_as_string(device_kernel));
+          collection.erase(collection.begin() + i);
+          break;
+        }
+      }
+    }
+
+    collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
    request_queue.push_back(request);
  }
  cond_var.notify_one();
@@ -248,8 +285,9 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
      continue;
    }

-    if (pipeline->scene_specialized) {
-      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALISED]) {
+    if (pipeline->pso_type != PSO_GENERIC) {
+      if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
+          pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
        best_pipeline = pipeline.get();
      }
    }
@@ -258,13 +296,65 @@ MetalKernelPipeline *ShaderCache::get_best_pipeline(DeviceKernel kernel, const M
    }
  }

+  if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
+    metal_printf("Swapping in %s version of %s\n",
+                 kernel_type_as_string(best_pipeline->pso_type),
+                 device_kernel_as_string(kernel));
+  }
+  best_pipeline->usage_count += 1;
+
  return best_pipeline;
 }

+bool MetalKernelPipeline::should_use_binary_archive() const
+{
+  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
+    if (atoi(str) != 0) {
+      /* Don't archive if we have opted out by env var. */
+      return false;
+    }
+  }
+
+  if (pso_type == PSO_GENERIC) {
+    /* Archive the generic kernels. */
+    return true;
+  }
+
+  if (device_kernel >= DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND &&
+      device_kernel <= DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+    /* Archive all shade kernels - they take a long time to compile. */
+    return true;
+  }
+
+  /* The remaining kernels are all fast to compile. They may get cached by the system shader cache,
+   * but will be quick to regenerate if not. */
+  return false;
+}
+
+static MTLFunctionConstantValues *GetConstantValues(KernelData const *data = nullptr)
+{
+  MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues new];
+
+  MTLDataType MTLDataType_int = MTLDataTypeInt;
+  MTLDataType MTLDataType_float = MTLDataTypeFloat;
+  MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
+  KernelData zero_data = {0};
+  if (!data) {
+    data = &zero_data;
+  }
+
+#  define KERNEL_STRUCT_MEMBER(parent, _type, name) \
+    [constant_values setConstantValue:&data->parent.name \
+                                 type:MTLDataType_##_type \
+                              atIndex:KernelData_##parent##_##name];
+
+#  include "kernel/data_template.h"
+
+  return constant_values;
+}
+
 void MetalKernelPipeline::compile()
 {
-  int pso_type = scene_specialized ? PSO_SPECIALISED : PSO_GENERIC;
-
  const std::string function_name = std::string("cycles_metal_") +
                                    device_kernel_as_string(device_kernel);

@@ -281,6 +371,17 @@ void MetalKernelPipeline::compile()
  if (@available(macOS 11.0, *)) {
    MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
    func_desc.name = entryPoint;
+
+    if (pso_type == PSO_SPECIALIZED_SHADE) {
+      func_desc.constantValues = GetConstantValues(&kernel_data_);
+    }
+    else if (pso_type == PSO_SPECIALIZED_INTERSECT) {
+      func_desc.constantValues = GetConstantValues(&kernel_data_);
+    }
+    else {
+      func_desc.constantValues = GetConstantValues();
+    }
+
    function = [mtlLibrary newFunctionWithDescriptor:func_desc error:&error];
  }

@@ -427,10 +528,7 @@ void MetalKernelPipeline::compile()

  MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;

-  bool use_binary_archive = true;
-  if (auto str = getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
-    use_binary_archive = (atoi(str) == 0);
-  }
+  bool use_binary_archive = should_use_binary_archive();

  id<MTLBinaryArchive> archive = nil;
  string metalbin_path;
@@ -608,17 +706,30 @@ void MetalKernelPipeline::compile()
  }
 }

-bool MetalDeviceKernels::load(MetalDevice *device, bool scene_specialized)
+bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
+{
+  const double starttime = time_dt();
+  auto shader_cache = get_shader_cache(device->mtlDevice);
+  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
+    shader_cache->load_kernel((DeviceKernel)i, device, pso_type);
+  }
+
+  shader_cache->wait_for_all();
+  metal_printf("Back-end compilation finished in %.1f seconds (%s)\n",
+               time_dt() - starttime,
+               kernel_type_as_string(pso_type));
+  return true;
+}
+
+bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
 {
  auto shader_cache = get_shader_cache(device->mtlDevice);
  for (int i = 0; i < DEVICE_KERNEL_NUM; i++) {
-    shader_cache->load_kernel((DeviceKernel)i, device, scene_specialized);
+    if (shader_cache->should_load_kernel((DeviceKernel)i, device, pso_type)) {
+      return true;
+    }
  }
-
-  if (!scene_specialized || getenv("CYCLES_METAL_PROFILING")) {
-    shader_cache->wait_for_all();
-  }
-  return true;
+  return false;
 }

 const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(const MetalDevice *device,
--- a/intern/cycles/device/metal/queue.h
+++ b/intern/cycles/device/metal/queue.h
@@ -24,6 +24,7 @@ class MetalDeviceQueue : public DeviceQueue {

  virtual int num_concurrent_states(const size_t) const override;
  virtual int num_concurrent_busy_states() const override;
+  virtual int num_sort_partition_elements() const override;

  virtual void init_execution() override;

--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -293,6 +293,11 @@ int MetalDeviceQueue::num_concurrent_busy_states() const
  return result;
 }

+int MetalDeviceQueue::num_sort_partition_elements() const
+{
+  return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
+}
+
 void MetalDeviceQueue::init_execution()
 {
  /* Synchronize all textures and memory copies before executing task. */
@@ -359,7 +364,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
  /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
  /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
  size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
-                                        sizeof(IntegratorStateGPU);
+                                        offsetof(IntegratorStateGPU, sort_partition_divisor);
  size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
  memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
         (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
@@ -416,7 +421,7 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,

  /* this relies on IntegratorStateGPU layout being contiguous device_ptrs  */
  const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
-                                   sizeof(IntegratorStateGPU);
+                                   offsetof(IntegratorStateGPU, sort_partition_divisor);
  for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
    int pointer_index = int(offset / sizeof(device_ptr));
    MetalDevice::MetalMem *mmem = *(
--- a/intern/cycles/device/metal/util.h
+++ b/intern/cycles/device/metal/util.h
@@ -37,6 +37,7 @@ struct MetalInfo {
  static int get_apple_gpu_core_count(id<MTLDevice> device);
  static MetalGPUVendor get_device_vendor(id<MTLDevice> device);
  static AppleGPUArchitecture get_apple_gpu_architecture(id<MTLDevice> device);
+  static int optimal_sort_partition_elements(id<MTLDevice> device);
  static string get_device_name(id<MTLDevice> device);
 };

--- a/intern/cycles/device/metal/util.mm
+++ b/intern/cycles/device/metal/util.mm
@@ -72,6 +72,21 @@ MetalGPUVendor MetalInfo::get_device_vendor(id<MTLDevice> device)
  return METAL_GPU_UNKNOWN;
 }

+int MetalInfo::optimal_sort_partition_elements(id<MTLDevice> device)
+{
+  if (auto str = getenv("CYCLES_METAL_SORT_PARTITION_ELEMENTS")) {
+    return atoi(str);
+  }
+
+  /* On M1 and M2 GPUs, we see better cache utilization if we partition the active indices before
+   * sorting each partition by material. Partitioning into chunks of 65536 elements results in an
+   * overall render time speedup of up to 15%. */
+  if (get_device_vendor(device) == METAL_GPU_APPLE) {
+    return 65536;
+  }
+  return 0;
+}
+
 vector<id<MTLDevice>> const &MetalInfo::get_usable_devices()
 {
  static vector<id<MTLDevice>> usable_devices;
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -2047,7 +2047,7 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)

    /* Update traversable handle (since it is different for each device on multi devices). */
    KernelData *const data = (KernelData *)host;
-    *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
+    *(OptixTraversableHandle *)&data->device_bvh = tlas_handle;

    update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
    return;
--- a/intern/cycles/device/queue.h
+++ b/intern/cycles/device/queue.h
@@ -105,6 +105,13 @@ class DeviceQueue {
   * value. */
  virtual int num_concurrent_busy_states() const = 0;

+  /* Number of elements in a partition of sorted shaders, that improves memory locality of
+   * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
+  virtual int num_sort_partition_elements() const
+  {
+    return 65536;
+  }
+
  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or path state.
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -373,7 +373,7 @@ void PathTrace::path_trace(RenderWork &render_work)
    work_balance_infos_[i].time_spent += work_time;
    work_balance_infos_[i].occupancy = statistics.occupancy;

-    VLOG_WORK << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
+    VLOG_INFO << "Rendered " << num_samples << " samples in " << work_time << " seconds ("
              << work_time / num_samples
              << " seconds per sample), occupancy: " << statistics.occupancy;
  });
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -181,27 +181,45 @@ void PathTraceWorkGPU::alloc_integrator_queue()

 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
+  /* Compute sort partitions, to balance between memory locality and coherence.
+   * Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of a
+   * more sophisticated heuristic we simply disable sort partitioning if the shader count is high.
+   */
+  num_sort_partitions_ = 1;
+  if (device_scene_->data.max_shaders < 300) {
+    const int num_elements = queue_->num_sort_partition_elements();
+    if (num_elements) {
+      num_sort_partitions_ = max(max_num_paths_ / num_elements, 1);
+    }
+  }
+
+  integrator_state_gpu_.sort_partition_divisor = (int)divide_up(max_num_paths_,
+                                                                num_sort_partitions_);
+
  /* Allocate arrays for shader sorting. */
-  const int max_shaders = device_scene_->data.max_shaders;
-  if (integrator_shader_sort_counter_.size() < max_shaders) {
-    integrator_shader_sort_counter_.alloc(max_shaders);
+  const int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;
+  if (integrator_shader_sort_counter_.size() < sort_buckets) {
+    integrator_shader_sort_counter_.alloc(sort_buckets);
    integrator_shader_sort_counter_.zero_to_device();
-
-    integrator_shader_raytrace_sort_counter_.alloc(max_shaders);
-    integrator_shader_raytrace_sort_counter_.zero_to_device();
-
-    integrator_shader_mnee_sort_counter_.alloc(max_shaders);
-    integrator_shader_mnee_sort_counter_.zero_to_device();
-
-    integrator_shader_sort_prefix_sum_.alloc(max_shaders);
-    integrator_shader_sort_prefix_sum_.zero_to_device();
-
    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
        (int *)integrator_shader_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
-        (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
-    integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
-        (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+      integrator_shader_raytrace_sort_counter_.alloc(sort_buckets);
+      integrator_shader_raytrace_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE] =
+          (int *)integrator_shader_raytrace_sort_counter_.device_pointer;
+    }
+
+    if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+      integrator_shader_mnee_sort_counter_.alloc(sort_buckets);
+      integrator_shader_mnee_sort_counter_.zero_to_device();
+      integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE] =
+          (int *)integrator_shader_mnee_sort_counter_.device_pointer;
+    }
+
+    integrator_shader_sort_prefix_sum_.alloc(sort_buckets);
+    integrator_shader_sort_prefix_sum_.zero_to_device();
  }
 }

@@ -333,8 +351,12 @@ void PathTraceWorkGPU::enqueue_reset()
  queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
  queue_->zero_to_device(integrator_queue_counter_);
  queue_->zero_to_device(integrator_shader_sort_counter_);
-  queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
-  queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
+    queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
+  }
+  if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
+    queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
+  }

  /* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
   * counter on the host side because `zero_to_device()` is not doing it. */
@@ -486,9 +508,9 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
  /* Compute prefix sum of number of active paths with each shader. */
  {
    const int work_size = 1;
-    int max_shaders = device_scene_->data.max_shaders;
+    int sort_buckets = device_scene_->data.max_shaders * num_sort_partitions_;

-    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &max_shaders);
+    DeviceKernelArguments args(&d_counter, &d_prefix_sum, &sort_buckets);

    queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
  }
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -156,6 +156,9 @@ class PathTraceWorkGPU : public PathTraceWork {
  bool interop_use_checked_ = false;
  bool interop_use_ = false;

+  /* Number of partitions to sort state indices into prior to material sort. */
+  int num_sort_partitions_;
+
  /* Maximum number of concurrent integrator states. */
  int max_num_paths_;

--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -79,6 +79,7 @@ set(SRC_KERNEL_DEVICE_METAL_HEADERS
  device/metal/compat.h
  device/metal/context_begin.h
  device/metal/context_end.h
+  device/metal/function_constants.h
  device/metal/globals.h
 )

@@ -154,6 +155,7 @@ set(SRC_KERNEL_SVM_HEADERS
  svm/math_util.h
  svm/mix.h
  svm/musgrave.h
+  svm/node_types_template.h
  svm/noise.h
  svm/noisetex.h
  svm/normal.h
@@ -282,6 +284,7 @@ set(SRC_KERNEL_UTIL_HEADERS

 set(SRC_KERNEL_TYPES_HEADERS
  data_arrays.h
+  data_template.h
  tables.h
  types.h
 )
@@ -844,10 +847,9 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
  else()
    list(APPEND sycl_compiler_flags -fPIC)

-    # avoid getting __FAST_MATH__ to be defined for the graphics compiler on CentOS 7 until the compile-time issue it triggers gets fixed.
-    if(WITH_CYCLES_ONEAPI_BINARIES)
-      list(APPEND sycl_compiler_flags -fhonor-nans)
-    endif()
+    # We avoid getting __FAST_MATH__ to be defined when building on CentOS 7 until the compilation crash
+    # it triggers at either AoT or JIT stages gets fixed.
+    list(APPEND sycl_compiler_flags -fhonor-nans)

    # add $ORIGIN to cycles_kernel_oneapi.so rpath so libsycl.so and
    # libpi_level_zero.so can be placed next to it and get found.
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -172,11 +172,11 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
    ray_flags |= OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
  }

-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
             ray->P,
             ray->D,
-             0.0f,
-             ray->t,
+             ray->tmin,
+             ray->tmax,
             ray->time,
             ray_mask,
             ray_flags,
@@ -203,28 +203,28 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
 #elif defined(__METALRT__)

  if (!scene_intersect_valid(ray)) {
-    isect->t = ray->t;
+    isect->t = ray->tmax;
    isect->type = PRIMITIVE_NONE;
    return false;
  }

 #  if defined(__KERNEL_DEBUG__)
  if (is_null_instance_acceleration_structure(metal_ancillaries->accel_struct)) {
-    isect->t = ray->t;
+    isect->t = ray->tmax;
    isect->type = PRIMITIVE_NONE;
    kernel_assert(!"Invalid metal_ancillaries->accel_struct pointer");
    return false;
  }

  if (is_null_intersection_function_table(metal_ancillaries->ift_default)) {
-    isect->t = ray->t;
+    isect->t = ray->tmax;
    isect->type = PRIMITIVE_NONE;
    kernel_assert(!"Invalid ift_default");
    return false;
  }
 #  endif

-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
  metalrt_intersector_type metalrt_intersect;

  if (!kernel_data.bvh.have_curves) {
@@ -263,7 +263,7 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
 #  endif

  if (intersection.type == intersection_type::none) {
-    isect->t = ray->t;
+    isect->t = ray->tmax;
    isect->type = PRIMITIVE_NONE;

    return false;
@@ -295,14 +295,14 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
  }

 #  ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
-    isect->t = ray->t;
+  if (kernel_data.device_bvh) {
+    isect->t = ray->tmax;
    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
    IntersectContext rtc_ctx(&ctx);
    RTCRayHit ray_hit;
    ctx.ray = ray;
    kernel_embree_setup_rayhit(*ray, ray_hit, visibility);
-    rtcIntersect1(kernel_data.bvh.scene, &rtc_ctx.context, &ray_hit);
+    rtcIntersect1(kernel_data.device_bvh, &rtc_ctx.context, &ray_hit);
    if (ray_hit.hit.geomID != RTC_INVALID_GEOMETRY_ID &&
        ray_hit.hit.primID != RTC_INVALID_GEOMETRY_ID) {
      kernel_embree_convert_hit(kg, &ray_hit.ray, &ray_hit.hit, isect);
@@ -357,11 +357,11 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
  if (local_isect) {
    local_isect->num_hits = 0; /* Initialize hit count to zero. */
  }
-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
             ray->P,
             ray->D,
-             0.0f,
-             ray->t,
+             ray->tmin,
+             ray->tmax,
             ray->time,
             0xFF,
             /* Need to always call into __anyhit__kernel_optix_local_hit. */
@@ -405,7 +405,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
  }
 #    endif

-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
  metalrt_intersector_type metalrt_intersect;

  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
@@ -451,7 +451,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
  }

 #    ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
+  if (kernel_data.device_bvh) {
    const bool has_bvh = !(kernel_data_fetch(object_flag, local_object) &
                           SD_OBJECT_TRANSFORM_APPLIED);
    CCLIntersectContext ctx(
@@ -470,13 +470,13 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,

    /* If this object has its own BVH, use it. */
    if (has_bvh) {
-      RTCGeometry geom = rtcGetGeometry(kernel_data.bvh.scene, local_object * 2);
+      RTCGeometry geom = rtcGetGeometry(kernel_data.device_bvh, local_object * 2);
      if (geom) {
        float3 P = ray->P;
        float3 dir = ray->D;
        float3 idir = ray->D;
        Transform ob_itfm;
-        rtc_ray.tfar = ray->t *
+        rtc_ray.tfar = ray->tmax *
                       bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
        /* bvh_instance_motion_push() returns the inverse transform but
         * it's not needed here. */
@@ -496,7 +496,7 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
      }
    }
    else {
-      rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+      rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
    }

    /* rtcOccluded1 sets tfar to -inf if a hit was found. */
@@ -539,11 +539,11 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
    ray_mask = 0xFF;
  }

-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
             ray->P,
             ray->D,
-             0.0f,
-             ray->t,
+             ray->tmin,
+             ray->tmax,
             ray->time,
             ray_mask,
             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
@@ -582,7 +582,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
  }
 #    endif

-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
  metalrt_intersector_type metalrt_intersect;

  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
@@ -633,7 +633,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
  }

 #    ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
+  if (kernel_data.device_bvh) {
    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
    Intersection *isect_array = (Intersection *)state->shadow_isect;
    ctx.isect_s = isect_array;
@@ -642,7 +642,7 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
    IntersectContext rtc_ctx(&ctx);
    RTCRay rtc_ray;
    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
-    rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);

    *num_recorded_hits = ctx.num_recorded_hits;
    *throughput = ctx.throughput;
@@ -698,11 +698,11 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
    ray_mask = 0xFF;
  }

-  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
+  optixTrace(scene_intersect_valid(ray) ? kernel_data.device_bvh : 0,
             ray->P,
             ray->D,
-             0.0f,
-             ray->t,
+             ray->tmin,
+             ray->tmax,
             ray->time,
             ray_mask,
             /* Need to always call into __anyhit__kernel_optix_volume_test. */
@@ -744,7 +744,7 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
  }
 #    endif

-  metal::raytracing::ray r(ray->P, ray->D, 0.0f, ray->t);
+  metal::raytracing::ray r(ray->P, ray->D, ray->tmin, ray->tmax);
  metalrt_intersector_type metalrt_intersect;

  metalrt_intersect.force_opacity(metal::raytracing::forced_opacity::non_opaque);
@@ -825,7 +825,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg,
  }

 #  ifdef __EMBREE__
-  if (kernel_data.bvh.scene) {
+  if (kernel_data.device_bvh) {
    CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
    ctx.isect_s = isect;
    ctx.max_hits = max_hits;
@@ -834,7 +834,7 @@ ccl_device_intersect uint scene_intersect_volume_all(KernelGlobals kg,
    IntersectContext rtc_ctx(&ctx);
    RTCRay rtc_ray;
    kernel_embree_setup_ray(*ray, rtc_ray, visibility);
-    rtcOccluded1(kernel_data.bvh.scene, &rtc_ctx.context, &rtc_ray);
+    rtcOccluded1(kernel_data.device_bvh, &rtc_ctx.context, &rtc_ray);
    return ctx.num_hits;
  }
 #  endif /* __EMBREE__ */
--- a/intern/cycles/kernel/bvh/embree.h
+++ b/intern/cycles/kernel/bvh/embree.h
@@ -83,8 +83,8 @@ ccl_device_inline void kernel_embree_setup_ray(const Ray &ray,
  rtc_ray.dir_x = ray.D.x;
  rtc_ray.dir_y = ray.D.y;
  rtc_ray.dir_z = ray.D.z;
-  rtc_ray.tnear = 0.0f;
-  rtc_ray.tfar = ray.t;
+  rtc_ray.tnear = ray.tmin;
+  rtc_ray.tfar = ray.tmax;
  rtc_ray.time = ray.time;
  rtc_ray.mask = visibility;
 }
@@ -107,7 +107,7 @@ ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg
    const int oID = hit->instID[0] / 2;
    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
      RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-          rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));
+          rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
      const int pID = hit->primID +
                      (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
      status = intersection_skip_self_shadow(ray->self, oID, pID);
@@ -117,7 +117,7 @@ ccl_device_inline bool kernel_embree_is_self_intersection(const KernelGlobals kg
    const int oID = hit->geomID / 2;
    if ((ray->self.object == oID) || (ray->self.light_object == oID)) {
      const int pID = hit->primID + (intptr_t)rtcGetGeometryUserData(
-                                        rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
+                                        rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
      status = intersection_skip_self_shadow(ray->self, oID, pID);
    }
  }
@@ -133,14 +133,14 @@ ccl_device_inline void kernel_embree_convert_hit(KernelGlobals kg,
  isect->t = ray->tfar;
  if (hit->instID[0] != RTC_INVALID_GEOMETRY_ID) {
    RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-        rtcGetGeometry(kernel_data.bvh.scene, hit->instID[0]));
+        rtcGetGeometry(kernel_data.device_bvh, hit->instID[0]));
    isect->prim = hit->primID +
                  (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
    isect->object = hit->instID[0] / 2;
  }
  else {
    isect->prim = hit->primID + (intptr_t)rtcGetGeometryUserData(
-                                    rtcGetGeometry(kernel_data.bvh.scene, hit->geomID));
+                                    rtcGetGeometry(kernel_data.device_bvh, hit->geomID));
    isect->object = hit->geomID / 2;
  }

@@ -166,7 +166,7 @@ ccl_device_inline void kernel_embree_convert_sss_hit(
  isect->v = hit->u;
  isect->t = ray->tfar;
  RTCScene inst_scene = (RTCScene)rtcGetGeometryUserData(
-      rtcGetGeometry(kernel_data.bvh.scene, object * 2));
+      rtcGetGeometry(kernel_data.device_bvh, object * 2));
  isect->prim = hit->primID +
                (intptr_t)rtcGetGeometryUserData(rtcGetGeometry(inst_scene, hit->geomID));
  isect->object = object;
--- a/intern/cycles/kernel/bvh/local.h
+++ b/intern/cycles/kernel/bvh/local.h
@@ -47,8 +47,9 @@ ccl_device_inline
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
  int object = OBJECT_NONE;
-  float isect_t = ray->t;
+  float isect_t = ray->tmax;

  if (local_isect != NULL) {
    local_isect->num_hits = 0;
@@ -59,10 +60,13 @@ ccl_device_inline
  if (!(object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
 #if BVH_FEATURE(BVH_MOTION)
    Transform ob_itfm;
-    isect_t *= bvh_instance_motion_push(kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
+    const float t_world_to_instance = bvh_instance_motion_push(
+        kg, local_object, ray, &P, &dir, &idir, &ob_itfm);
 #else
-    isect_t *= bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
+    const float t_world_to_instance = bvh_instance_push(kg, local_object, ray, &P, &dir, &idir);
 #endif
+    isect_t *= t_world_to_instance;
+    tmin *= t_world_to_instance;
    object = local_object;
  }

@@ -81,6 +85,7 @@ ccl_device_inline
                                       dir,
 #endif
                                       idir,
+                                       tmin,
                                       isect_t,
                                       node_addr,
                                       PATH_RAY_ALL_VISIBILITY,
@@ -155,6 +160,7 @@ ccl_device_inline
                                           local_object,
                                           prim,
                                           prim_addr,
+                                           tmin,
                                           isect_t,
                                           lcg_state,
                                           max_hits)) {
@@ -191,6 +197,7 @@ ccl_device_inline
                                                  local_object,
                                                  prim,
                                                  prim_addr,
+                                                  tmin,
                                                  isect_t,
                                                  lcg_state,
                                                  max_hits)) {
--- a/intern/cycles/kernel/bvh/nodes.h
+++ b/intern/cycles/kernel/bvh/nodes.h
@@ -18,7 +18,8 @@ ccl_device_forceinline Transform bvh_unaligned_node_fetch_space(KernelGlobals kg
 ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
                                                      const float3 P,
                                                      const float3 idir,
-                                                      const float t,
+                                                      const float tmin,
+                                                      const float tmax,
                                                      const int node_addr,
                                                      const uint visibility,
                                                      float dist[2])
@@ -39,8 +40,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
  float c0hiy = (node1.z - P.y) * idir.y;
  float c0loz = (node2.x - P.z) * idir.z;
  float c0hiz = (node2.z - P.z) * idir.z;
-  float c0min = max4(0.0f, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
-  float c0max = min4(t, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));
+  float c0min = max4(tmin, min(c0lox, c0hix), min(c0loy, c0hiy), min(c0loz, c0hiz));
+  float c0max = min4(tmax, max(c0lox, c0hix), max(c0loy, c0hiy), max(c0loz, c0hiz));

  float c1lox = (node0.y - P.x) * idir.x;
  float c1hix = (node0.w - P.x) * idir.x;
@@ -48,8 +49,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
  float c1hiy = (node1.w - P.y) * idir.y;
  float c1loz = (node2.y - P.z) * idir.z;
  float c1hiz = (node2.w - P.z) * idir.z;
-  float c1min = max4(0.0f, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
-  float c1max = min4(t, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));
+  float c1min = max4(tmin, min(c1lox, c1hix), min(c1loy, c1hiy), min(c1loz, c1hiz));
+  float c1max = min4(tmax, max(c1lox, c1hix), max(c1loy, c1hiy), max(c1loz, c1hiz));

  dist[0] = c0min;
  dist[1] = c1min;
@@ -66,7 +67,8 @@ ccl_device_forceinline int bvh_aligned_node_intersect(KernelGlobals kg,
 ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals kg,
                                                               const float3 P,
                                                               const float3 dir,
-                                                               const float t,
+                                                               const float tmin,
+                                                               const float tmax,
                                                               int node_addr,
                                                               int child,
                                                               float dist[2])
@@ -83,8 +85,8 @@ ccl_device_forceinline bool bvh_unaligned_node_intersect_child(KernelGlobals kg,
  const float far_x = max(lower_xyz.x, upper_xyz.x);
  const float far_y = max(lower_xyz.y, upper_xyz.y);
  const float far_z = max(lower_xyz.z, upper_xyz.z);
-  const float tnear = max4(0.0f, near_x, near_y, near_z);
-  const float tfar = min4(t, far_x, far_y, far_z);
+  const float tnear = max4(tmin, near_x, near_y, near_z);
+  const float tfar = min4(tmax, far_x, far_y, far_z);
  *dist = tnear;
  return tnear <= tfar;
 }
@@ -93,7 +95,8 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg,
                                                        const float3 P,
                                                        const float3 dir,
                                                        const float3 idir,
-                                                        const float t,
+                                                        const float tmin,
+                                                        const float tmax,
                                                        const int node_addr,
                                                        const uint visibility,
                                                        float dist[2])
@@ -102,7 +105,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg,
 #ifdef __VISIBILITY_FLAG__
  float4 cnodes = kernel_data_fetch(bvh_nodes, node_addr + 0);
 #endif
-  if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 0, &dist[0])) {
+  if (bvh_unaligned_node_intersect_child(kg, P, dir, tmin, tmax, node_addr, 0, &dist[0])) {
 #ifdef __VISIBILITY_FLAG__
    if ((__float_as_uint(cnodes.x) & visibility))
 #endif
@@ -110,7 +113,7 @@ ccl_device_forceinline int bvh_unaligned_node_intersect(KernelGlobals kg,
      mask |= 1;
    }
  }
-  if (bvh_unaligned_node_intersect_child(kg, P, dir, t, node_addr, 1, &dist[1])) {
+  if (bvh_unaligned_node_intersect_child(kg, P, dir, tmin, tmax, node_addr, 1, &dist[1])) {
 #ifdef __VISIBILITY_FLAG__
    if ((__float_as_uint(cnodes.y) & visibility))
 #endif
@@ -125,16 +128,17 @@ ccl_device_forceinline int bvh_node_intersect(KernelGlobals kg,
                                              const float3 P,
                                              const float3 dir,
                                              const float3 idir,
-                                              const float t,
+                                              const float tmin,
+                                              const float tmax,
                                              const int node_addr,
                                              const uint visibility,
                                              float dist[2])
 {
  float4 node = kernel_data_fetch(bvh_nodes, node_addr);
  if (__float_as_uint(node.x) & PATH_RAY_NODE_UNALIGNED) {
-    return bvh_unaligned_node_intersect(kg, P, dir, idir, t, node_addr, visibility, dist);
+    return bvh_unaligned_node_intersect(kg, P, dir, idir, tmin, tmax, node_addr, visibility, dist);
  }
  else {
-    return bvh_aligned_node_intersect(kg, P, idir, t, node_addr, visibility, dist);
+    return bvh_aligned_node_intersect(kg, P, idir, tmin, tmax, node_addr, visibility, dist);
  }
 }
--- a/intern/cycles/kernel/bvh/shadow_all.h
+++ b/intern/cycles/kernel/bvh/shadow_all.h
@@ -49,6 +49,7 @@ ccl_device_inline
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
  int object = OBJECT_NONE;
  uint num_hits = 0;

@@ -59,12 +60,12 @@ ccl_device_inline
  /* Max distance in world space. May be dynamically reduced when max number of
   * recorded hits is exceeded and we no longer need to find hits beyond the max
   * distance found. */
-  float t_max_world = ray->t;
+  float t_max_world = ray->tmax;

  /* Current maximum distance to the intersection.
   * Is calculated as a ray length, transformed to an object space when entering
   * instance node. */
-  float t_max_current = ray->t;
+  float t_max_current = ray->tmax;

  /* Conversion from world to local space for the current instance if any, 1.0
   * otherwise. */
@@ -88,6 +89,7 @@ ccl_device_inline
                                       dir,
 #endif
                                       idir,
+                                       tmin,
                                       t_max_current,
                                       node_addr,
                                       visibility,
@@ -156,8 +158,16 @@ ccl_device_inline

            switch (type & PRIMITIVE_ALL) {
              case PRIMITIVE_TRIANGLE: {
-                hit = triangle_intersect(
-                    kg, &isect, P, dir, t_max_current, visibility, prim_object, prim, prim_addr);
+                hit = triangle_intersect(kg,
+                                         &isect,
+                                         P,
+                                         dir,
+                                         tmin,
+                                         t_max_current,
+                                         visibility,
+                                         prim_object,
+                                         prim,
+                                         prim_addr);
                break;
              }
 #if BVH_FEATURE(BVH_MOTION)
@@ -166,6 +176,7 @@ ccl_device_inline
                                                &isect,
                                                P,
                                                dir,
+                                                tmin,
                                                t_max_current,
                                                ray->time,
                                                visibility,
@@ -189,8 +200,16 @@ ccl_device_inline
                }

                const int curve_type = kernel_data_fetch(prim_type, prim_addr);
-                hit = curve_intersect(
-                    kg, &isect, P, dir, t_max_current, prim_object, prim, ray->time, curve_type);
+                hit = curve_intersect(kg,
+                                      &isect,
+                                      P,
+                                      dir,
+                                      tmin,
+                                      t_max_current,
+                                      prim_object,
+                                      prim,
+                                      ray->time,
+                                      curve_type);

                break;
              }
@@ -207,8 +226,16 @@ ccl_device_inline
                }

                const int point_type = kernel_data_fetch(prim_type, prim_addr);
-                hit = point_intersect(
-                    kg, &isect, P, dir, t_max_current, prim_object, prim, ray->time, point_type);
+                hit = point_intersect(kg,
+                                      &isect,
+                                      P,
+                                      dir,
+                                      tmin,
+                                      t_max_current,
+                                      prim_object,
+                                      prim,
+                                      ray->time,
+                                      point_type);
                break;
              }
 #endif /* BVH_FEATURE(BVH_POINTCLOUD) */
@@ -302,6 +329,7 @@ ccl_device_inline

          /* Convert intersection to object space. */
          t_max_current *= t_world_to_instance;
+          tmin *= t_world_to_instance;

          ++stack_ptr;
          kernel_assert(stack_ptr < BVH_STACK_SIZE);
@@ -323,7 +351,8 @@ ccl_device_inline
 #endif

      /* Restore world space ray length. */
-      t_max_current = ray->t;
+      tmin = ray->tmin;
+      t_max_current = ray->tmax;

      object = OBJECT_NONE;
      t_world_to_instance = 1.0f;
--- a/intern/cycles/kernel/bvh/traversal.h
+++ b/intern/cycles/kernel/bvh/traversal.h
@@ -43,13 +43,14 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
  int object = OBJECT_NONE;

 #if BVH_FEATURE(BVH_MOTION)
  Transform ob_itfm;
 #endif

-  isect->t = ray->t;
+  isect->t = ray->tmax;
  isect->u = 0.0f;
  isect->v = 0.0f;
  isect->prim = PRIM_NONE;
@@ -71,6 +72,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                                         dir,
 #endif
                                         idir,
+                                         tmin,
                                         isect->t,
                                         node_addr,
                                         visibility,
@@ -133,8 +135,16 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,

            switch (type & PRIMITIVE_ALL) {
              case PRIMITIVE_TRIANGLE: {
-                if (triangle_intersect(
-                        kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr)) {
+                if (triangle_intersect(kg,
+                                       isect,
+                                       P,
+                                       dir,
+                                       tmin,
+                                       isect->t,
+                                       visibility,
+                                       prim_object,
+                                       prim,
+                                       prim_addr)) {
                  /* shadow ray early termination */
                  if (visibility & PATH_RAY_SHADOW_OPAQUE)
                    return true;
@@ -147,6 +157,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
                                              isect,
                                              P,
                                              dir,
+                                              tmin,
                                              isect->t,
                                              ray->time,
                                              visibility,
@@ -174,7 +185,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,

                const int curve_type = kernel_data_fetch(prim_type, prim_addr);
                const bool hit = curve_intersect(
-                    kg, isect, P, dir, isect->t, prim_object, prim, ray->time, curve_type);
+                    kg, isect, P, dir, tmin, isect->t, prim_object, prim, ray->time, curve_type);
                if (hit) {
                  /* shadow ray early termination */
                  if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -195,7 +206,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,

                const int point_type = kernel_data_fetch(prim_type, prim_addr);
                const bool hit = point_intersect(
-                    kg, isect, P, dir, isect->t, prim_object, prim, ray->time, point_type);
+                    kg, isect, P, dir, tmin, isect->t, prim_object, prim, ray->time, point_type);
                if (hit) {
                  /* shadow ray early termination */
                  if (visibility & PATH_RAY_SHADOW_OPAQUE)
@@ -212,11 +223,15 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
          object = kernel_data_fetch(prim_object, -prim_addr - 1);

 #if BVH_FEATURE(BVH_MOTION)
-          isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
+          const float t_world_to_instance = bvh_instance_motion_push(
+              kg, object, ray, &P, &dir, &idir, &ob_itfm);
 #else
-          isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+          const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

+          isect->t *= t_world_to_instance;
+          tmin *= t_world_to_instance;
+
          ++stack_ptr;
          kernel_assert(stack_ptr < BVH_STACK_SIZE);
          traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -235,6 +250,7 @@ ccl_device_noinline bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals kg,
 #else
      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #endif
+      tmin = ray->tmin;

      object = OBJECT_NONE;
      node_addr = traversal_stack[stack_ptr];
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -5,6 +5,19 @@

 CCL_NAMESPACE_BEGIN

+/* Offset intersection distance by the smallest possible amount, to skip
+ * intersections at this distance. This works in cases where the ray start
+ * position is unchanged and only tmin is updated, since for self
+ * intersection we'll be comparing against the exact same distances. */
+ccl_device_forceinline float intersection_t_offset(const float t)
+{
+  /* This is a simplified version of nextafterf(t, FLT_MAX), only dealing with
+   * non-negative and finite t. */
+  kernel_assert(t >= 0.0f && isfinite_safe(t));
+  const uint32_t bits = (t == 0.0f) ? 1 : __float_as_uint(t) + 1;
+  return __uint_as_float(bits);
+}
+
 #if defined(__KERNEL_CPU__)
 ccl_device int intersections_compare(const void *a, const void *b)
 {
--- a/intern/cycles/kernel/bvh/volume.h
+++ b/intern/cycles/kernel/bvh/volume.h
@@ -46,13 +46,14 @@ ccl_device_inline
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
  int object = OBJECT_NONE;

 #if BVH_FEATURE(BVH_MOTION)
  Transform ob_itfm;
 #endif

-  isect->t = ray->t;
+  isect->t = ray->tmax;
  isect->u = 0.0f;
  isect->v = 0.0f;
  isect->prim = PRIM_NONE;
@@ -73,6 +74,7 @@ ccl_device_inline
                                       dir,
 #endif
                                       idir,
+                                       tmin,
                                       isect->t,
                                       node_addr,
                                       visibility,
@@ -140,7 +142,7 @@ ccl_device_inline
                  continue;
                }
                triangle_intersect(
-                    kg, isect, P, dir, isect->t, visibility, prim_object, prim, prim_addr);
+                    kg, isect, P, dir, tmin, isect->t, visibility, prim_object, prim, prim_addr);
              }
              break;
            }
@@ -165,6 +167,7 @@ ccl_device_inline
                                          isect,
                                          P,
                                          dir,
+                                          tmin,
                                          isect->t,
                                          ray->time,
                                          visibility,
@@ -186,11 +189,15 @@ ccl_device_inline
          int object_flag = kernel_data_fetch(object_flag, object);
          if (object_flag & SD_OBJECT_HAS_VOLUME) {
 #if BVH_FEATURE(BVH_MOTION)
-            isect->t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
+            const float t_world_to_instance = bvh_instance_motion_push(
+                kg, object, ray, &P, &dir, &idir, &ob_itfm);
 #else
-            isect->t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+            const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

+            isect->t *= t_world_to_instance;
+            tmin *= t_world_to_instance;
+
            ++stack_ptr;
            kernel_assert(stack_ptr < BVH_STACK_SIZE);
            traversal_stack[stack_ptr] = ENTRYPOINT_SENTINEL;
@@ -217,6 +224,8 @@ ccl_device_inline
      isect->t = bvh_instance_pop(kg, object, ray, &P, &dir, &idir, isect->t);
 #endif

+      tmin = ray->tmin;
+
      object = OBJECT_NONE;
      node_addr = traversal_stack[stack_ptr];
      --stack_ptr;
--- a/intern/cycles/kernel/bvh/volume_all.h
+++ b/intern/cycles/kernel/bvh/volume_all.h
@@ -44,12 +44,12 @@ ccl_device_inline
  int node_addr = kernel_data.bvh.root;

  /* ray parameters in registers */
-  const float tmax = ray->t;
  float3 P = ray->P;
  float3 dir = bvh_clamp_direction(ray->D);
  float3 idir = bvh_inverse_direction(dir);
+  float tmin = ray->tmin;
  int object = OBJECT_NONE;
-  float isect_t = tmax;
+  float isect_t = ray->tmax;

 #if BVH_FEATURE(BVH_MOTION)
  Transform ob_itfm;
@@ -58,7 +58,7 @@ ccl_device_inline
  int num_hits_in_instance = 0;

  uint num_hits = 0;
-  isect_array->t = tmax;
+  isect_array->t = ray->tmax;

  /* traversal loop */
  do {
@@ -75,6 +75,7 @@ ccl_device_inline
                                       dir,
 #endif
                                       idir,
+                                       tmin,
                                       isect_t,
                                       node_addr,
                                       visibility,
@@ -141,8 +142,16 @@ ccl_device_inline
                if ((object_flag & SD_OBJECT_HAS_VOLUME) == 0) {
                  continue;
                }
-                hit = triangle_intersect(
-                    kg, isect_array, P, dir, isect_t, visibility, prim_object, prim, prim_addr);
+                hit = triangle_intersect(kg,
+                                         isect_array,
+                                         P,
+                                         dir,
+                                         tmin,
+                                         isect_t,
+                                         visibility,
+                                         prim_object,
+                                         prim,
+                                         prim_addr);
                if (hit) {
                  /* Move on to next entry in intersections array. */
                  isect_array++;
@@ -189,6 +198,7 @@ ccl_device_inline
                                                isect_array,
                                                P,
                                                dir,
+                                                tmin,
                                                isect_t,
                                                ray->time,
                                                visibility,
@@ -232,11 +242,15 @@ ccl_device_inline
          int object_flag = kernel_data_fetch(object_flag, object);
          if (object_flag & SD_OBJECT_HAS_VOLUME) {
 #if BVH_FEATURE(BVH_MOTION)
-            isect_t *= bvh_instance_motion_push(kg, object, ray, &P, &dir, &idir, &ob_itfm);
+            const float t_world_to_instance = bvh_instance_motion_push(
+                kg, object, ray, &P, &dir, &idir, &ob_itfm);
 #else
-            isect_t *= bvh_instance_push(kg, object, ray, &P, &dir, &idir);
+            const float t_world_to_instance = bvh_instance_push(kg, object, ray, &P, &dir, &idir);
 #endif

+            isect_t *= t_world_to_instance;
+            tmin *= t_world_to_instance;
+
            num_hits_in_instance = 0;
            isect_array->t = isect_t;

@@ -280,7 +294,8 @@ ccl_device_inline
 #endif
      }

-      isect_t = tmax;
+      tmin = ray->tmin;
+      isect_t = ray->tmax;
      isect_array->t = isect_t;

      object = OBJECT_NONE;
--- a/intern/cycles/kernel/camera/camera.h
+++ b/intern/cycles/kernel/camera/camera.h
@@ -165,9 +165,11 @@ ccl_device void camera_sample_perspective(KernelGlobals kg,
  float nearclip = kernel_data.cam.nearclip * z_inv;
  ray->P += nearclip * ray->D;
  ray->dP += nearclip * ray->dD;
-  ray->t = kernel_data.cam.cliplength * z_inv;
+  ray->tmin = 0.0f;
+  ray->tmax = kernel_data.cam.cliplength * z_inv;
 #else
-  ray->t = FLT_MAX;
+  ray->tmin = 0.0f;
+  ray->tmax = FLT_MAX;
 #endif
 }

@@ -231,9 +233,11 @@ ccl_device void camera_sample_orthographic(KernelGlobals kg,

 #ifdef __CAMERA_CLIPPING__
  /* clipping */
-  ray->t = kernel_data.cam.cliplength;
+  ray->tmin = 0.0f;
+  ray->tmax = kernel_data.cam.cliplength;
 #else
-  ray->t = FLT_MAX;
+  ray->tmin = 0.0f;
+  ray->tmax = FLT_MAX;
 #endif
 }

@@ -258,7 +262,7 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,

  /* indicates ray should not receive any light, outside of the lens */
  if (is_zero(D)) {
-    ray->t = 0.0f;
+    ray->tmax = 0.0f;
    return;
  }

@@ -349,9 +353,11 @@ ccl_device_inline void camera_sample_panorama(ccl_constant KernelCamera *cam,
  float nearclip = cam->nearclip;
  ray->P += nearclip * ray->D;
  ray->dP += nearclip * ray->dD;
-  ray->t = cam->cliplength;
+  ray->tmin = 0.0f;
+  ray->tmax = cam->cliplength;
 #else
-  ray->t = FLT_MAX;
+  ray->tmin = 0.0f;
+  ray->tmax = FLT_MAX;
 #endif
 }

@@ -368,7 +374,7 @@ ccl_device_inline void camera_sample(KernelGlobals kg,
                                     ccl_private Ray *ray)
 {
  /* pixel filter */
-  int filter_table_offset = kernel_data.film.filter_table_offset;
+  int filter_table_offset = kernel_data.tables.filter_table_offset;
  float raster_x = x + lookup_table_read(kg, filter_u, filter_table_offset, FILTER_TABLE_SIZE);
  float raster_y = y + lookup_table_read(kg, filter_v, filter_table_offset, FILTER_TABLE_SIZE);

--- a/intern/cycles/kernel/data_template.h
+++ b/intern/cycles/kernel/data_template.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef KERNEL_STRUCT_BEGIN
+#  define KERNEL_STRUCT_BEGIN(name, parent)
+#endif
+#ifndef KERNEL_STRUCT_END
+#  define KERNEL_STRUCT_END(name)
+#endif
+#ifndef KERNEL_STRUCT_MEMBER
+#  define KERNEL_STRUCT_MEMBER(parent, type, name)
+#endif
+
+/* Background. */
+
+KERNEL_STRUCT_BEGIN(KernelBackground, background)
+/* xyz store direction, w the angle. float4 instead of float3 is used
+ * to ensure consistent padding/alignment across devices. */
+KERNEL_STRUCT_MEMBER(background, float4, sun)
+/* Only shader index. */
+KERNEL_STRUCT_MEMBER(background, int, surface_shader)
+KERNEL_STRUCT_MEMBER(background, int, volume_shader)
+KERNEL_STRUCT_MEMBER(background, float, volume_step_size)
+KERNEL_STRUCT_MEMBER(background, int, transparent)
+KERNEL_STRUCT_MEMBER(background, float, transparent_roughness_squared_threshold)
+/* Portal sampling. */
+KERNEL_STRUCT_MEMBER(background, float, portal_weight)
+KERNEL_STRUCT_MEMBER(background, int, num_portals)
+KERNEL_STRUCT_MEMBER(background, int, portal_offset)
+/* Sun sampling. */
+KERNEL_STRUCT_MEMBER(background, float, sun_weight)
+/* Importance map sampling. */
+KERNEL_STRUCT_MEMBER(background, float, map_weight)
+KERNEL_STRUCT_MEMBER(background, int, map_res_x)
+KERNEL_STRUCT_MEMBER(background, int, map_res_y)
+/* Multiple importance sampling. */
+KERNEL_STRUCT_MEMBER(background, int, use_mis)
+/* Lightgroup. */
+KERNEL_STRUCT_MEMBER(background, int, lightgroup)
+/* Padding. */
+KERNEL_STRUCT_MEMBER(background, int, pad1)
+KERNEL_STRUCT_MEMBER(background, int, pad2)
+KERNEL_STRUCT_MEMBER(background, int, pad3)
+KERNEL_STRUCT_END(KernelBackground)
+
+/* BVH: own BVH2 if no native device acceleration struct used. */
+
+KERNEL_STRUCT_BEGIN(KernelBVH, bvh)
+KERNEL_STRUCT_MEMBER(bvh, int, root)
+KERNEL_STRUCT_MEMBER(bvh, int, have_motion)
+KERNEL_STRUCT_MEMBER(bvh, int, have_curves)
+KERNEL_STRUCT_MEMBER(bvh, int, bvh_layout)
+KERNEL_STRUCT_MEMBER(bvh, int, use_bvh_steps)
+KERNEL_STRUCT_MEMBER(bvh, int, curve_subdivisions)
+KERNEL_STRUCT_MEMBER(bvh, int, pad1)
+KERNEL_STRUCT_MEMBER(bvh, int, pad2)
+KERNEL_STRUCT_END(KernelBVH)
+
+/* Film. */
+
+KERNEL_STRUCT_BEGIN(KernelFilm, film)
+/* XYZ to rendering color space transform. float4 instead of float3 to
+ * ensure consistent padding/alignment across devices. */
+KERNEL_STRUCT_MEMBER(film, float4, xyz_to_r)
+KERNEL_STRUCT_MEMBER(film, float4, xyz_to_g)
+KERNEL_STRUCT_MEMBER(film, float4, xyz_to_b)
+KERNEL_STRUCT_MEMBER(film, float4, rgb_to_y)
+/* Rec709 to rendering color space. */
+KERNEL_STRUCT_MEMBER(film, float4, rec709_to_r)
+KERNEL_STRUCT_MEMBER(film, float4, rec709_to_g)
+KERNEL_STRUCT_MEMBER(film, float4, rec709_to_b)
+KERNEL_STRUCT_MEMBER(film, int, is_rec709)
+/* Exposuse. */
+KERNEL_STRUCT_MEMBER(film, float, exposure)
+/* Passed used. */
+KERNEL_STRUCT_MEMBER(film, int, pass_flag)
+KERNEL_STRUCT_MEMBER(film, int, light_pass_flag)
+/* Pass offsets. */
+KERNEL_STRUCT_MEMBER(film, int, pass_stride)
+KERNEL_STRUCT_MEMBER(film, int, pass_combined)
+KERNEL_STRUCT_MEMBER(film, int, pass_depth)
+KERNEL_STRUCT_MEMBER(film, int, pass_position)
+KERNEL_STRUCT_MEMBER(film, int, pass_normal)
+KERNEL_STRUCT_MEMBER(film, int, pass_roughness)
+KERNEL_STRUCT_MEMBER(film, int, pass_motion)
+KERNEL_STRUCT_MEMBER(film, int, pass_motion_weight)
+KERNEL_STRUCT_MEMBER(film, int, pass_uv)
+KERNEL_STRUCT_MEMBER(film, int, pass_object_id)
+KERNEL_STRUCT_MEMBER(film, int, pass_material_id)
+KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_glossy_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_transmission_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_glossy_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_transmission_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_indirect)
+KERNEL_STRUCT_MEMBER(film, int, pass_diffuse_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_glossy_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_transmission_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_volume_direct)
+KERNEL_STRUCT_MEMBER(film, int, pass_emission)
+KERNEL_STRUCT_MEMBER(film, int, pass_background)
+KERNEL_STRUCT_MEMBER(film, int, pass_ao)
+KERNEL_STRUCT_MEMBER(film, float, pass_alpha_threshold)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow)
+KERNEL_STRUCT_MEMBER(film, float, pass_shadow_scale)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_sample_count)
+KERNEL_STRUCT_MEMBER(film, int, pass_shadow_catcher_matte)
+/* Cryptomatte. */
+KERNEL_STRUCT_MEMBER(film, int, cryptomatte_passes)
+KERNEL_STRUCT_MEMBER(film, int, cryptomatte_depth)
+KERNEL_STRUCT_MEMBER(film, int, pass_cryptomatte)
+/* Adaptive sampling. */
+KERNEL_STRUCT_MEMBER(film, int, pass_adaptive_aux_buffer)
+KERNEL_STRUCT_MEMBER(film, int, pass_sample_count)
+/* Mist. */
+KERNEL_STRUCT_MEMBER(film, int, pass_mist)
+KERNEL_STRUCT_MEMBER(film, float, mist_start)
+KERNEL_STRUCT_MEMBER(film, float, mist_inv_depth)
+KERNEL_STRUCT_MEMBER(film, float, mist_falloff)
+/* Denoising. */
+KERNEL_STRUCT_MEMBER(film, int, pass_denoising_normal)
+KERNEL_STRUCT_MEMBER(film, int, pass_denoising_albedo)
+KERNEL_STRUCT_MEMBER(film, int, pass_denoising_depth)
+/* AOVs. */
+KERNEL_STRUCT_MEMBER(film, int, pass_aov_color)
+KERNEL_STRUCT_MEMBER(film, int, pass_aov_value)
+/* Light groups. */
+KERNEL_STRUCT_MEMBER(film, int, pass_lightgroup)
+/* Baking. */
+KERNEL_STRUCT_MEMBER(film, int, pass_bake_primitive)
+KERNEL_STRUCT_MEMBER(film, int, pass_bake_differential)
+/* Shadow catcher. */
+KERNEL_STRUCT_MEMBER(film, int, use_approximate_shadow_catcher)
+/* Padding. */
+KERNEL_STRUCT_MEMBER(film, int, pad1)
+KERNEL_STRUCT_MEMBER(film, int, pad2)
+KERNEL_STRUCT_END(KernelFilm)
+
+/* Integrator. */
+
+KERNEL_STRUCT_BEGIN(KernelIntegrator, integrator)
+/* Emission. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_direct_light)
+KERNEL_STRUCT_MEMBER(integrator, int, num_distribution)
+KERNEL_STRUCT_MEMBER(integrator, int, num_all_lights)
+KERNEL_STRUCT_MEMBER(integrator, float, pdf_triangles)
+KERNEL_STRUCT_MEMBER(integrator, float, pdf_lights)
+KERNEL_STRUCT_MEMBER(integrator, float, light_inv_rr_threshold)
+/* Bounces. */
+KERNEL_STRUCT_MEMBER(integrator, int, min_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_diffuse_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_glossy_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_transmission_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, max_volume_bounce)
+/* AO bounces. */
+KERNEL_STRUCT_MEMBER(integrator, int, ao_bounces)
+KERNEL_STRUCT_MEMBER(integrator, float, ao_bounces_distance)
+KERNEL_STRUCT_MEMBER(integrator, float, ao_bounces_factor)
+KERNEL_STRUCT_MEMBER(integrator, float, ao_additive_factor)
+/* Transparency. */
+KERNEL_STRUCT_MEMBER(integrator, int, transparent_min_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, transparent_max_bounce)
+KERNEL_STRUCT_MEMBER(integrator, int, transparent_shadows)
+/* Caustics. */
+KERNEL_STRUCT_MEMBER(integrator, int, caustics_reflective)
+KERNEL_STRUCT_MEMBER(integrator, int, caustics_refractive)
+KERNEL_STRUCT_MEMBER(integrator, float, filter_glossy)
+/* Seed. */
+KERNEL_STRUCT_MEMBER(integrator, int, seed)
+/* Clamp. */
+KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_direct)
+KERNEL_STRUCT_MEMBER(integrator, float, sample_clamp_indirect)
+/* MIS. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_lamp_mis)
+/* Caustics. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_caustics)
+/* Sampling pattern. */
+KERNEL_STRUCT_MEMBER(integrator, int, sampling_pattern)
+KERNEL_STRUCT_MEMBER(integrator, float, scrambling_distance)
+/* Volume render. */
+KERNEL_STRUCT_MEMBER(integrator, int, use_volumes)
+KERNEL_STRUCT_MEMBER(integrator, int, volume_max_steps)
+KERNEL_STRUCT_MEMBER(integrator, float, volume_step_rate)
+/* Shadow catcher. */
+KERNEL_STRUCT_MEMBER(integrator, int, has_shadow_catcher)
+/* Closure filter. */
+KERNEL_STRUCT_MEMBER(integrator, int, filter_closures)
+/* MIS debugging. */
+KERNEL_STRUCT_MEMBER(integrator, int, direct_light_sampling_type)
+/* Padding */
+KERNEL_STRUCT_MEMBER(integrator, int, pad1)
+KERNEL_STRUCT_END(KernelIntegrator)
+
+/* SVM. For shader specialization. */
+
+KERNEL_STRUCT_BEGIN(KernelSVMUsage, svm_usage)
+#define SHADER_NODE_TYPE(type) KERNEL_STRUCT_MEMBER(svm_usage, int, type)
+#include "kernel/svm/node_types_template.h"
+KERNEL_STRUCT_END(KernelSVMUsage)
+
+#undef KERNEL_STRUCT_BEGIN
+#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_END
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -246,7 +246,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
 ccl_gpu_kernel_postfix

 #if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
-constant int __dummy_constant [[function_constant(0)]];
+constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
 #endif

 ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
--- a/intern/cycles/kernel/device/metal/function_constants.h
+++ b/intern/cycles/kernel/device/metal/function_constants.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2021-2022 Blender Foundation */
+
+enum {
+  Kernel_DummyConstant,
+#define KERNEL_STRUCT_MEMBER(parent, type, name) KernelData_##parent##_##name,
+#include "kernel/data_template.h"
+};
+
+#ifdef __KERNEL_METAL__
+#  define KERNEL_STRUCT_MEMBER(parent, type, name) \
+    constant type kernel_data_##parent##_##name \
+        [[function_constant(KernelData_##parent##_##name)]];
+#  include "kernel/data_template.h"
+#endif
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -5,6 +5,7 @@

 #include "kernel/device/metal/compat.h"
 #include "kernel/device/metal/globals.h"
+#include "kernel/device/metal/function_constants.h"
 #include "kernel/device/gpu/kernel.h"

 /* MetalRT intersection handlers */
@@ -409,6 +410,7 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
                                const float3 ray_origin,
                                const float3 ray_direction,
                                float time,
+                                const float ray_tmin,
                                const float ray_tmax,
                                thread BoundingBoxIntersectionResult &result)
 {
@@ -433,7 +435,7 @@ void metalrt_intersection_curve(constant KernelParamsMetal &launch_params_metal,
    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
    result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
                  launch_params_metal, payload, object, prim, isect.u);
    if (result.accept) {
@@ -455,6 +457,7 @@ void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params
                                       const float3 ray_origin,
                                       const float3 ray_direction,
                                       float time,
+                                       const float ray_tmin,
                                       const float ray_tmax,
                                       thread BoundingBoxIntersectionResult &result)
 {
@@ -474,7 +477,7 @@ void metalrt_intersection_curve_shadow(constant KernelParamsMetal &launch_params
    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.curve_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
    result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
    result.accept = !result.continue_search;
@@ -493,6 +496,7 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
                             const uint primitive_id [[primitive_id]],
                             const float3 ray_origin [[origin]],
                             const float3 ray_direction [[direction]],
+                             const float ray_tmin [[min_distance]],
                             const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
@@ -510,7 +514,7 @@ __intersection__curve_ribbon(constant KernelParamsMetal &launch_params_metal [[b
 #  else
                               0.0f,
 #  endif
-                               ray_tmax, result);
+                               ray_tmin, ray_tmax, result);
  }

  return result;
@@ -524,6 +528,7 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
                                    const uint primitive_id [[primitive_id]],
                                    const float3 ray_origin [[origin]],
                                    const float3 ray_direction [[direction]],
+                                    const float ray_tmin [[min_distance]],
                                    const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
@@ -541,7 +546,7 @@ __intersection__curve_ribbon_shadow(constant KernelParamsMetal &launch_params_me
 #  else
                               0.0f,
 #  endif
-                               ray_tmax, result);
+                               ray_tmin, ray_tmax, result);
  }

  return result;
@@ -555,6 +560,7 @@ __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buff
                          const uint primitive_id [[primitive_id]],
                          const float3 ray_origin [[origin]],
                          const float3 ray_direction [[direction]],
+                          const float ray_tmin [[min_distance]],
                          const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
@@ -570,7 +576,7 @@ __intersection__curve_all(constant KernelParamsMetal &launch_params_metal [[buff
 #  else
                             0.0f,
 #  endif
-                             ray_tmax, result);
+                             ray_tmin, ray_tmax, result);

  return result;
 }
@@ -583,6 +589,7 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal
                                 const uint primitive_id [[primitive_id]],
                                 const float3 ray_origin [[origin]],
                                 const float3 ray_direction [[direction]],
+                                 const float ray_tmin [[min_distance]],
                                 const float ray_tmax [[max_distance]])
 {
  uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
@@ -599,7 +606,7 @@ __intersection__curve_all_shadow(constant KernelParamsMetal &launch_params_metal
 #  else
                             0.0f,
 #  endif
-                             ray_tmax, result);
+                             ray_tmin, ray_tmax, result);

  return result;
 }
@@ -615,6 +622,7 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
                                const float3 ray_origin,
                                const float3 ray_direction,
                                float time,
+                                const float ray_tmin,
                                const float ray_tmax,
                                thread BoundingBoxIntersectionResult &result)
 {
@@ -639,7 +647,7 @@ void metalrt_intersection_point(constant KernelParamsMetal &launch_params_metal,
    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
    result = metalrt_visibility_test<BoundingBoxIntersectionResult, METALRT_HIT_BOUNDING_BOX>(
                  launch_params_metal, payload, object, prim, isect.u);
    if (result.accept) {
@@ -661,6 +669,7 @@ void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params
                                       const float3 ray_origin,
                                       const float3 ray_direction,
                                       float time,
+                                       const float ray_tmin,
                                       const float ray_tmax,
                                       thread BoundingBoxIntersectionResult &result)
 {
@@ -680,7 +689,7 @@ void metalrt_intersection_point_shadow(constant KernelParamsMetal &launch_params
    isect.t *= len;

  MetalKernelContext context(launch_params_metal);
-  if (context.point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (context.point_intersect(NULL, &isect, P, dir, ray_tmin, isect.t, object, prim, time, type)) {
    result.continue_search = metalrt_shadow_all_hit<METALRT_HIT_BOUNDING_BOX>(
                launch_params_metal, payload, object, prim, float2(isect.u, isect.v), ray_tmax);
    result.accept = !result.continue_search;
@@ -699,6 +708,7 @@ __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1
                             const uint primitive_id [[primitive_id]],
                             const float3 ray_origin [[origin]],
                             const float3 ray_direction [[direction]],
+                             const float ray_tmin [[min_distance]],
                             const float ray_tmax [[max_distance]])
 {
  const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
@@ -715,7 +725,7 @@ __intersection__point(constant KernelParamsMetal &launch_params_metal [[buffer(1
 #  else
                             0.0f,
 #  endif
-                             ray_tmax, result);
+                             ray_tmin, ray_tmax, result);

  return result;
 }
@@ -728,6 +738,7 @@ __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[b
                                    const uint primitive_id [[primitive_id]],
                                    const float3 ray_origin [[origin]],
                                    const float3 ray_direction [[direction]],
+                                    const float ray_tmin [[min_distance]],
                                    const float ray_tmax [[max_distance]])
 {
  const uint prim = primitive_id + kernel_data_fetch(object_prim_offset, object);
@@ -744,7 +755,7 @@ __intersection__point_shadow(constant KernelParamsMetal &launch_params_metal [[b
 #  else
                             0.0f,
 #  endif
-                             ray_tmax, result);
+                             ray_tmin, ray_tmax, result);

  return result;
 }
--- a/intern/cycles/kernel/device/oneapi/kernel.cpp
+++ b/intern/cycles/kernel/device/oneapi/kernel.cpp
@@ -670,7 +670,7 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
 }

 static const int lowest_supported_driver_version_win = 1011660;
-static const int lowest_supported_driver_version_neo = 20066;
+static const int lowest_supported_driver_version_neo = 23570;

 static int parse_driver_build_version(const sycl::device &device)
 {
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -51,32 +51,36 @@ ccl_device_forceinline int get_object_id()
 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_closest()
 {
  const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
  integrator_intersect_closest(nullptr, path_index, kernel_params.render_buffer);
 }

 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_shadow()
 {
  const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
  integrator_intersect_shadow(nullptr, path_index);
 }

 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_subsurface()
 {
  const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
  integrator_intersect_subsurface(nullptr, path_index);
 }

 extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_stack()
 {
  const int global_index = optixGetLaunchIndex().x;
-  const int path_index = (kernel_params.path_index_array) ? kernel_params.path_index_array[global_index] :
-                                                       global_index;
+  const int path_index = (kernel_params.path_index_array) ?
+                             kernel_params.path_index_array[global_index] :
+                             global_index;
  integrator_intersect_volume_stack(nullptr, path_index);
 }

@@ -408,6 +412,7 @@ ccl_device_inline void optix_intersection_curve(const int prim, const int type)

  float3 P = optixGetObjectRayOrigin();
  float3 dir = optixGetObjectRayDirection();
+  float tmin = optixGetRayTmin();

  /* The direction is not normalized by default, but the curve intersection routine expects that */
  float len;
@@ -425,7 +430,7 @@ ccl_device_inline void optix_intersection_curve(const int prim, const int type)
  if (isect.t != FLT_MAX)
    isect.t *= len;

-  if (curve_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (curve_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) {
    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
    optixReportIntersection(isect.t / len,
                            type & PRIMITIVE_ALL,
@@ -462,6 +467,7 @@ extern "C" __global__ void __intersection__point()

  float3 P = optixGetObjectRayOrigin();
  float3 dir = optixGetObjectRayDirection();
+  float tmin = optixGetRayTmin();

  /* The direction is not normalized by default, the point intersection routine expects that. */
  float len;
@@ -480,7 +486,7 @@ extern "C" __global__ void __intersection__point()
    isect.t *= len;
  }

-  if (point_intersect(NULL, &isect, P, dir, isect.t, object, prim, time, type)) {
+  if (point_intersect(NULL, &isect, P, dir, tmin, isect.t, object, prim, time, type)) {
    static_assert(PRIMITIVE_ALL < 128, "Values >= 128 are reserved for OptiX internal use");
    optixReportIntersection(isect.t / len, type & PRIMITIVE_ALL);
  }
--- a/intern/cycles/kernel/geom/curve_intersect.h
+++ b/intern/cycles/kernel/geom/curve_intersect.h
@@ -156,7 +156,8 @@ ccl_device_inline float2 half_plane_intersect(const float3 P, const float3 N, co
 }

 ccl_device bool curve_intersect_iterative(const float3 ray_dir,
-                                          ccl_private float *ray_tfar,
+                                          const float ray_tmin,
+                                          ccl_private float *ray_tmax,
                                          const float dt,
                                          const float4 curve[4],
                                          float u,
@@ -220,7 +221,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,

    if (fabsf(f) < f_err && fabsf(g) < g_err) {
      t += dt;
-      if (!(0.0f <= t && t <= *ray_tfar)) {
+      if (!(t >= ray_tmin && t <= *ray_tmax)) {
        return false; /* Rejects NaNs */
      }
      if (!(u >= 0.0f && u <= 1.0f)) {
@@ -237,7 +238,7 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,
      }

      /* Record intersection. */
-      *ray_tfar = t;
+      *ray_tmax = t;
      isect->t = t;
      isect->u = u;
      isect->v = 0.0f;
@@ -250,7 +251,8 @@ ccl_device bool curve_intersect_iterative(const float3 ray_dir,

 ccl_device bool curve_intersect_recursive(const float3 ray_orig,
                                          const float3 ray_dir,
-                                          float ray_tfar,
+                                          const float ray_tmin,
+                                          float ray_tmax,
                                          float4 curve[4],
                                          ccl_private Intersection *isect)
 {
@@ -331,7 +333,7 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
      }

      /* Intersect with cap-planes. */
-      float2 tp = make_float2(-dt, ray_tfar - dt);
+      float2 tp = make_float2(ray_tmin - dt, ray_tmax - dt);
      tp = make_float2(max(tp.x, tc_outer.x), min(tp.y, tc_outer.y));
      const float2 h0 = half_plane_intersect(
          float4_to_float3(P0), float4_to_float3(dP0du), ray_dir);
@@ -394,19 +396,20 @@ ccl_device bool curve_intersect_recursive(const float3 ray_orig,
                                          CURVE_NUM_BEZIER_SUBDIVISIONS;
        if (depth >= termDepth) {
          found |= curve_intersect_iterative(
-              ray_dir, &ray_tfar, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
+              ray_dir, ray_tmin, &ray_tmax, dt, curve, u_outer0, tp0.x, use_backfacing, isect);
        }
        else {
          recurse = true;
        }
      }

-      if (valid1 && (tp1.x + dt <= ray_tfar)) {
+      const float t1 = tp1.x + dt;
+      if (valid1 && (t1 >= ray_tmin && t1 <= ray_tmax)) {
        const int termDepth = unstable1 ? CURVE_NUM_BEZIER_SUBDIVISIONS_UNSTABLE :
                                          CURVE_NUM_BEZIER_SUBDIVISIONS;
        if (depth >= termDepth) {
          found |= curve_intersect_iterative(
-              ray_dir, &ray_tfar, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
+              ray_dir, ray_tmin, &ray_tmax, dt, curve, u_outer1, tp1.y, use_backfacing, isect);
        }
        else {
          recurse = true;
@@ -456,7 +459,8 @@ ccl_device_inline bool cylinder_culling_test(const float2 p1, const float2 p2, c
 * v0,v1,v3 and v2,v3,v1. The edge v1,v2 decides which of the two
 * triangles gets intersected.
 */
-ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,
+ccl_device_inline bool ribbon_intersect_quad(const float ray_tmin,
+                                             const float ray_tmax,
                                             const float3 quad_v0,
                                             const float3 quad_v1,
                                             const float3 quad_v2,
@@ -497,7 +501,7 @@ ccl_device_inline bool ribbon_intersect_quad(const float ray_tfar,

  /* Perform depth test? */
  const float t = rcpDen * dot(v0, Ng);
-  if (!(0.0f <= t && t <= ray_tfar)) {
+  if (!(t >= ray_tmin && t <= ray_tmax)) {
    return false;
  }

@@ -534,7 +538,8 @@ ccl_device_inline float4 ribbon_to_ray_space(const float3 ray_space[3],

 ccl_device_inline bool ribbon_intersect(const float3 ray_org,
                                        const float3 ray_dir,
-                                        float ray_tfar,
+                                        const float ray_tmin,
+                                        float ray_tmax,
                                        const int N,
                                        float4 curve[4],
                                        ccl_private Intersection *isect)
@@ -582,7 +587,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,

      /* Intersect quad. */
      float vu, vv, vt;
-      bool valid0 = ribbon_intersect_quad(ray_tfar, lp0, lp1, up1, up0, &vu, &vv, &vt);
+      bool valid0 = ribbon_intersect_quad(ray_tmin, ray_tmax, lp0, lp1, up1, up0, &vu, &vv, &vt);

      if (valid0) {
        /* ignore self intersections */
@@ -596,7 +601,7 @@ ccl_device_inline bool ribbon_intersect(const float3 ray_org,
          vv = 2.0f * vv - 1.0f;

          /* Record intersection. */
-          ray_tfar = vt;
+          ray_tmax = vt;
          isect->t = vt;
          isect->u = u + vu * step_size;
          isect->v = vv;
@@ -616,6 +621,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
                                            ccl_private Intersection *isect,
                                            const float3 P,
                                            const float3 dir,
+                                            const float tmin,
                                            const float tmax,
                                            int object,
                                            int prim,
@@ -645,7 +651,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
  if (type & PRIMITIVE_CURVE_RIBBON) {
    /* todo: adaptive number of subdivisions could help performance here. */
    const int subdivisions = kernel_data.bvh.curve_subdivisions;
-    if (ribbon_intersect(P, dir, tmax, subdivisions, curve, isect)) {
+    if (ribbon_intersect(P, dir, tmin, tmax, subdivisions, curve, isect)) {
      isect->prim = prim;
      isect->object = object;
      isect->type = type;
@@ -655,7 +661,7 @@ ccl_device_forceinline bool curve_intersect(KernelGlobals kg,
    return false;
  }
  else {
-    if (curve_intersect_recursive(P, dir, tmax, curve, isect)) {
+    if (curve_intersect_recursive(P, dir, tmin, tmax, curve, isect)) {
      isect->prim = prim;
      isect->object = object;
      isect->type = type;
--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -46,6 +46,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
                                                 ccl_private Intersection *isect,
                                                 float3 P,
                                                 float3 dir,
+                                                 float tmin,
                                                 float tmax,
                                                 float time,
                                                 uint visibility,
@@ -58,7 +59,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
  motion_triangle_vertices(kg, object, prim, time, verts);
  /* Ray-triangle intersection, unoptimized. */
  float t, u, v;
-  if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
+  if (ray_triangle_intersect(P, dir, tmin, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
    /* Visibility flag test. we do it here under the assumption
     * that most triangles are culled by node flags.
@@ -92,6 +93,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
                                                       int object,
                                                       int prim,
                                                       int prim_addr,
+                                                       float tmin,
                                                       float tmax,
                                                       ccl_private uint *lcg_state,
                                                       int max_hits)
@@ -101,7 +103,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
  motion_triangle_vertices(kg, object, prim, time, verts);
  /* Ray-triangle intersection, unoptimized. */
  float t, u, v;
-  if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
+  if (!ray_triangle_intersect(P, dir, tmin, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
    return false;
  }

--- a/intern/cycles/kernel/geom/point_intersect.h
+++ b/intern/cycles/kernel/geom/point_intersect.h
@@ -9,8 +9,12 @@ CCL_NAMESPACE_BEGIN

 #ifdef __POINTCLOUD__

-ccl_device_forceinline bool point_intersect_test(
-    const float4 point, const float3 P, const float3 dir, const float tmax, ccl_private float *t)
+ccl_device_forceinline bool point_intersect_test(const float4 point,
+                                                 const float3 P,
+                                                 const float3 dir,
+                                                 const float tmin,
+                                                 const float tmax,
+                                                 ccl_private float *t)
 {
  const float3 center = float4_to_float3(point);
  const float radius = point.w;
@@ -28,12 +32,12 @@ ccl_device_forceinline bool point_intersect_test(

  const float td = sqrt((r2 - l2) * rd2);
  const float t_front = projC0 - td;
-  const bool valid_front = (0.0f <= t_front) & (t_front <= tmax);
+  const bool valid_front = (tmin <= t_front) & (t_front <= tmax);

  /* Always back-face culling for now. */
 #  if 0
  const float t_back = projC0 + td;
-  const bool valid_back = (0.0f <= t_back) & (t_back <= tmax);
+  const bool valid_back = (tmin <= t_back) & (t_back <= tmax);

  /* check if there is a first hit */
  const bool valid_first = valid_front | valid_back;
@@ -56,6 +60,7 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg,
                                            ccl_private Intersection *isect,
                                            const float3 P,
                                            const float3 dir,
+                                            const float tmin,
                                            const float tmax,
                                            const int object,
                                            const int prim,
@@ -65,7 +70,7 @@ ccl_device_forceinline bool point_intersect(KernelGlobals kg,
  const float4 point = (type & PRIMITIVE_MOTION) ? motion_point(kg, object, prim, time) :
                                                   kernel_data_fetch(points, prim);

-  if (!point_intersect_test(point, P, dir, tmax, &isect->t)) {
+  if (!point_intersect_test(point, P, dir, tmin, tmax, &isect->t)) {
    return false;
  }

--- a/intern/cycles/kernel/geom/shader_data.h
+++ b/intern/cycles/kernel/geom/shader_data.h
@@ -407,7 +407,7 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals kg,
 {

  /* vectors */
-  sd->P = ray->P;
+  sd->P = ray->P + ray->D * ray->tmin;
  sd->N = -ray->D;
  sd->Ng = -ray->D;
  sd->I = -ray->D;
@@ -441,7 +441,6 @@ ccl_device_inline void shader_setup_from_volume(KernelGlobals kg,

  /* for NDC coordinates */
  sd->ray_P = ray->P;
-  sd->ray_dP = ray->dP;
 }
 #endif /* __VOLUME__ */

--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -17,6 +17,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
                                          ccl_private Intersection *isect,
                                          float3 P,
                                          float3 dir,
+                                          float tmin,
                                          float tmax,
                                          uint visibility,
                                          int object,
@@ -28,7 +29,7 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
               tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
               tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
  float t, u, v;
-  if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
+  if (ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
    /* Visibility flag test. we do it here under the assumption
     * that most triangles are culled by node flags.
@@ -62,6 +63,7 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
                                                int object,
                                                int prim,
                                                int prim_addr,
+                                                float tmin,
                                                float tmax,
                                                ccl_private uint *lcg_state,
                                                int max_hits)
@@ -71,7 +73,7 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
               tri_b = kernel_data_fetch(tri_verts, tri_vindex + 1),
               tri_c = kernel_data_fetch(tri_verts, tri_vindex + 2);
  float t, u, v;
-  if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
+  if (!ray_triangle_intersect(P, dir, tmin, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
    return false;
  }

--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -174,14 +174,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
    Ray ray ccl_optional_struct_init;
    ray.P = zero_float3();
    ray.D = normalize(P);
-    ray.t = FLT_MAX;
+    ray.tmin = 0.0f;
+    ray.tmax = FLT_MAX;
    ray.time = 0.5f;
    ray.dP = differential_zero_compact();
    ray.dD = differential_zero_compact();
    integrator_state_write_ray(kg, state, &ray);

    /* Setup next kernel to execute. */
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
  }
  else {
    /* Surface baking. */
@@ -210,7 +211,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
    Ray ray ccl_optional_struct_init;
    ray.P = P + N;
    ray.D = -N;
-    ray.t = FLT_MAX;
+    ray.tmin = 0.0f;
+    ray.tmax = FLT_MAX;
    ray.time = 0.5f;

    /* Setup differentials. */
@@ -247,13 +249,15 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
    const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);

    if (use_caustics) {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index);
+      integrator_path_init_sorted(
+          kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader_index);
    }
    else if (use_raytrace_kernel) {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
+      integrator_path_init_sorted(
+          kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader_index);
    }
    else {
-      INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index);
+      integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader_index);
    }
  }

--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -86,7 +86,7 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
    /* Generate camera ray. */
    Ray ray;
    integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
-    if (ray.t == 0.0f) {
+    if (ray.tmax == 0.0f) {
      return true;
    }

@@ -100,10 +100,10 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
  /* Continue with intersect_closest kernel, optionally initializing volume
   * stack before that if the camera may be inside a volume. */
  if (kernel_data.cam.is_inside_volume) {
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
  }
  else {
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
  }

  return true;
--- a/intern/cycles/kernel/integrator/intersect_closest.h
+++ b/intern/cycles/kernel/integrator/intersect_closest.h
@@ -109,14 +109,14 @@ ccl_device_forceinline void integrator_split_shadow_catcher(
    /* If using background pass, schedule background shading kernel so that we have a background
     * to alpha-over on. The background kernel will then continue the path afterwards. */
    INTEGRATOR_STATE_WRITE(state, path, flag) |= PATH_RAY_SHADOW_CATCHER_BACKGROUND;
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    return;
  }

  if (!integrator_state_volume_stack_is_empty(kg, state)) {
    /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
     * objects from it, and then continue shading volume and shadow catcher surface after. */
-    INTEGRATOR_PATH_INIT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    integrator_path_init(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
    return;
  }

@@ -128,18 +128,19 @@ ccl_device_forceinline void integrator_split_shadow_catcher(
  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);

  if (use_caustics) {
-    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+    integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
  }
  else if (use_raytrace_kernel) {
-    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+    integrator_path_init_sorted(
+        kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
  }
  else {
-    INTEGRATOR_PATH_INIT_SORTED(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+    integrator_path_init_sorted(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
  }
 }

 /* Schedule next kernel to be executed after updating volume stack for shadow catcher. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_volume(
    KernelGlobals kg, IntegratorState state)
 {
@@ -156,20 +157,21 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
  const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);

  if (use_caustics) {
-    INTEGRATOR_PATH_NEXT_SORTED(
-        current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+    integrator_path_next_sorted(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
  }
  else if (use_raytrace_kernel) {
-    INTEGRATOR_PATH_NEXT_SORTED(
-        current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+    integrator_path_next_sorted(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
  }
  else {
-    INTEGRATOR_PATH_NEXT_SORTED(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+    integrator_path_next_sorted(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
  }
 }

 /* Schedule next kernel to be executed after executing background shader for shadow catcher. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catcher_background(
    KernelGlobals kg, IntegratorState state)
 {
@@ -177,7 +179,8 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
  if (!integrator_state_volume_stack_is_empty(kg, state)) {
    /* Volume stack is not empty. Re-init the volume stack to exclude any non-shadow catcher
     * objects from it, and then continue shading volume and shadow catcher surface after. */
-    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
+    integrator_path_next(
+        kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK);
    return;
  }

@@ -190,7 +193,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_shadow_catche
 *
 * Note that current_kernel is a template value since making this a variable
 * leads to poor performance with CUDA atomics. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel(
    KernelGlobals kg,
    IntegratorState state,
@@ -206,10 +209,10 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
    const int flags = (hit_surface) ? kernel_data_fetch(shaders, shader).flags : 0;

    if (!integrator_intersect_terminate(kg, state, flags)) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
    }
    else {
-      INTEGRATOR_PATH_TERMINATE(current_kernel);
+      integrator_path_terminate(kg, state, current_kernel);
    }
    return;
  }
@@ -218,7 +221,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
  if (hit) {
    /* Hit a surface, continue with light or surface kernel. */
    if (isect->type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
    }
    else {
      /* Hit a surface, continue with surface kernel unless terminated. */
@@ -231,16 +234,16 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
                                  (object_flags & SD_OBJECT_CAUSTICS);
        const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);
        if (use_caustics) {
-          INTEGRATOR_PATH_NEXT_SORTED(
-              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+          integrator_path_next_sorted(
+              kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
        }
        else if (use_raytrace_kernel) {
-          INTEGRATOR_PATH_NEXT_SORTED(
-              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+          integrator_path_next_sorted(
+              kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
        }
        else {
-          INTEGRATOR_PATH_NEXT_SORTED(
-              current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+          integrator_path_next_sorted(
+              kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
        }

 #ifdef __SHADOW_CATCHER__
@@ -249,13 +252,13 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
 #endif
      }
      else {
-        INTEGRATOR_PATH_TERMINATE(current_kernel);
+        integrator_path_terminate(kg, state, current_kernel);
      }
    }
  }
  else {
    /* Nothing hit, continue with background kernel. */
-    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
  }
 }

@@ -263,7 +266,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel(
 *
 * The logic here matches integrator_intersect_next_kernel, except that
 * volume shading and termination testing have already been done. */
-template<uint32_t current_kernel>
+template<DeviceKernel current_kernel>
 ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
    KernelGlobals kg,
    IntegratorState state,
@@ -273,7 +276,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
  if (isect->prim != PRIM_NONE) {
    /* Hit a surface, continue with light or surface kernel. */
    if (isect->type & PRIMITIVE_LAMP) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
      return;
    }
    else {
@@ -286,16 +289,16 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
      const bool use_raytrace_kernel = (flags & SD_HAS_RAYTRACE);

      if (use_caustics) {
-        INTEGRATOR_PATH_NEXT_SORTED(
-            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
+        integrator_path_next_sorted(
+            kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE, shader);
      }
      else if (use_raytrace_kernel) {
-        INTEGRATOR_PATH_NEXT_SORTED(
-            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
+        integrator_path_next_sorted(
+            kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE, shader);
      }
      else {
-        INTEGRATOR_PATH_NEXT_SORTED(
-            current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
+        integrator_path_next_sorted(
+            kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE, shader);
      }

 #ifdef __SHADOW_CATCHER__
@@ -307,7 +310,7 @@ ccl_device_forceinline void integrator_intersect_next_kernel_after_volume(
  }
  else {
    /* Nothing hit, continue with background kernel. */
-    INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+    integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
    return;
  }
 }
@@ -321,7 +324,7 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,
  /* Read ray from integrator state into local memory. */
  Ray ray ccl_optional_struct_init;
  integrator_state_read_ray(kg, state, &ray);
-  kernel_assert(ray.t != 0.0f);
+  kernel_assert(ray.tmax != 0.0f);

  const uint visibility = path_state_ray_visibility(state);
  const int last_isect_prim = INTEGRATOR_STATE(state, isect, prim);
@@ -329,12 +332,12 @@ ccl_device void integrator_intersect_closest(KernelGlobals kg,

  /* Trick to use short AO rays to approximate indirect light at the end of the path. */
  if (path_state_ao_bounce(kg, state)) {
-    ray.t = kernel_data.integrator.ao_bounces_distance;
+    ray.tmax = kernel_data.integrator.ao_bounces_distance;

    if (last_isect_object != OBJECT_NONE) {
      const float object_ao_distance = kernel_data_fetch(objects, last_isect_object).ao_distance;
      if (object_ao_distance != 0.0f) {
-        ray.t = object_ao_distance;
+        ray.tmax = object_ao_distance;
      }
    }
  }
--- a/intern/cycles/kernel/integrator/intersect_shadow.h
+++ b/intern/cycles/kernel/integrator/intersect_shadow.h
@@ -162,7 +162,7 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt

  if (opaque_hit) {
    /* Hit an opaque surface, shadow path ends here. */
-    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
+    integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
    return;
  }
  else {
@@ -171,7 +171,9 @@ ccl_device void integrator_intersect_shadow(KernelGlobals kg, IntegratorShadowSt
     *
     * TODO: could also write to render buffer directly if no transparent shadows?
     * Could save a kernel execution for the common case. */
-    INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
+    integrator_shadow_path_next(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW,
                                DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
    return;
  }
--- a/intern/cycles/kernel/integrator/intersect_subsurface.h
+++ b/intern/cycles/kernel/integrator/intersect_subsurface.h
@@ -17,7 +17,7 @@ ccl_device void integrator_intersect_subsurface(KernelGlobals kg, IntegratorStat
  }
 #endif

-  INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+  integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/intersect_volume_stack.h
@@ -24,7 +24,8 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,

  Ray volume_ray ccl_optional_struct_init;
  volume_ray.P = from_P;
-  volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
+  volume_ray.D = normalize_len(to_P - from_P, &volume_ray.tmax);
+  volume_ray.tmin = 0.0f;
  volume_ray.self.object = INTEGRATOR_STATE(state, isect, object);
  volume_ray.self.prim = INTEGRATOR_STATE(state, isect, prim);
  volume_ray.self.light_object = OBJECT_NONE;
@@ -58,12 +59,9 @@ ccl_device void integrator_volume_stack_update_for_subsurface(KernelGlobals kg,
    volume_stack_enter_exit(kg, state, stack_sd);

    /* Move ray forward. */
-    volume_ray.P = stack_sd->P;
+    volume_ray.tmin = intersection_t_offset(isect.t);
    volume_ray.self.object = isect.object;
    volume_ray.self.prim = isect.prim;
-    if (volume_ray.t != FLT_MAX) {
-      volume_ray.D = normalize_len(to_P - volume_ray.P, &volume_ray.t);
-    }
    ++step;
  }
 #endif
@@ -82,7 +80,8 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
  /* Trace ray in random direction. Any direction works, Z up is a guess to get the
   * fewest hits. */
  volume_ray.D = make_float3(0.0f, 0.0f, 1.0f);
-  volume_ray.t = FLT_MAX;
+  volume_ray.tmin = 0.0f;
+  volume_ray.tmax = FLT_MAX;
  volume_ray.self.object = OBJECT_NONE;
  volume_ray.self.prim = PRIM_NONE;
  volume_ray.self.light_object = OBJECT_NONE;
@@ -199,7 +198,7 @@ ccl_device void integrator_volume_stack_init(KernelGlobals kg, IntegratorState s
    }

    /* Move ray forward. */
-    volume_ray.P = stack_sd->P;
+    volume_ray.tmin = intersection_t_offset(isect.t);
    volume_ray.self.object = isect.object;
    volume_ray.self.prim = isect.prim;
    ++step;
@@ -222,7 +221,9 @@ ccl_device void integrator_intersect_volume_stack(KernelGlobals kg, IntegratorSt
  }
  else {
    /* Volume stack init for camera rays, continue with intersection of camera ray. */
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
+    integrator_path_next(kg,
+                         state,
+                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK,
                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
  }
 }
--- a/intern/cycles/kernel/integrator/mnee.h
+++ b/intern/cycles/kernel/integrator/mnee.h
@@ -137,8 +137,14 @@ ccl_device_forceinline void mnee_update_light_sample(KernelGlobals kg,
    }
  }
  else if (ls->type == LIGHT_AREA) {
+    float invarea = fabsf(klight->area.invarea);
    ls->D = normalize_len(ls->P - P, &ls->t);
-    ls->pdf = fabsf(klight->area.invarea);
+    ls->pdf = invarea;
+    if (klight->area.tan_spread > 0.f) {
+      ls->eval_fac = 0.25f * invarea;
+      ls->eval_fac *= light_spread_attenuation(
+          ls->D, ls->Ng, klight->area.tan_spread, klight->area.normalize_spread);
+    }
  }

  ls->pdf *= kernel_data.integrator.pdf_lights;
@@ -436,6 +442,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,
  projection_ray.self.light_prim = PRIM_NONE;
  projection_ray.dP = differential_make_compact(sd->dP);
  projection_ray.dD = differential_zero_compact();
+  projection_ray.tmin = 0.0f;
  projection_ray.time = sd->time;
  Intersection projection_isect;

@@ -499,8 +506,8 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,
        projection_ray.self.prim = pv.prim;
        projection_ray.P = pv.p;
      }
-      projection_ray.D = normalize_len(tentative_p - projection_ray.P, &projection_ray.t);
-      projection_ray.t *= MNEE_PROJECTION_DISTANCE_MULTIPLIER;
+      projection_ray.D = normalize_len(tentative_p - projection_ray.P, &projection_ray.tmax);
+      projection_ray.tmax *= MNEE_PROJECTION_DISTANCE_MULTIPLIER;

      bool projection_success = false;
      for (int isect_count = 0; isect_count < MNEE_MAX_INTERSECTION_COUNT; isect_count++) {
@@ -519,8 +526,7 @@ ccl_device_forceinline bool mnee_newton_solver(KernelGlobals kg,

        projection_ray.self.object = projection_isect.object;
        projection_ray.self.prim = projection_isect.prim;
-        projection_ray.P += projection_isect.t * projection_ray.D;
-        projection_ray.t -= projection_isect.t;
+        projection_ray.tmin = intersection_t_offset(projection_isect.t);
      }
      if (!projection_success) {
        reduce_stepsize = true;
@@ -852,6 +858,7 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
  Ray probe_ray;
  probe_ray.self.light_object = ls->object;
  probe_ray.self.light_prim = ls->prim;
+  probe_ray.tmin = 0.0f;
  probe_ray.dP = differential_make_compact(sd->dP);
  probe_ray.dD = differential_zero_compact();
  probe_ray.time = sd->time;
@@ -867,13 +874,13 @@ ccl_device_forceinline bool mnee_path_contribution(KernelGlobals kg,
    ccl_private const ManifoldVertex &v = vertices[vi];

    /* Check visibility. */
-    probe_ray.D = normalize_len(v.p - probe_ray.P, &probe_ray.t);
+    probe_ray.D = normalize_len(v.p - probe_ray.P, &probe_ray.tmax);
    if (scene_intersect(kg, &probe_ray, PATH_RAY_TRANSMIT, &probe_isect)) {
      int hit_object = (probe_isect.object == OBJECT_NONE) ?
                           kernel_data_fetch(prim_object, probe_isect.prim) :
                           probe_isect.object;
      /* Test whether the ray hit the appropriate object at its intended location. */
-      if (hit_object != v.object || fabsf(probe_ray.t - probe_isect.t) > MNEE_MIN_DISTANCE)
+      if (hit_object != v.object || fabsf(probe_ray.tmax - probe_isect.t) > MNEE_MIN_DISTANCE)
        return false;
    }
    probe_ray.self.object = v.object;
@@ -952,15 +959,16 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,
  probe_ray.self.light_object = ls->object;
  probe_ray.self.light_prim = ls->prim;
  probe_ray.P = sd->P;
+  probe_ray.tmin = 0.0f;
  if (ls->t == FLT_MAX) {
    /* Distant / env light. */
    probe_ray.D = ls->D;
-    probe_ray.t = ls->t;
+    probe_ray.tmax = ls->t;
  }
  else {
    /* Other lights, avoid self-intersection. */
    probe_ray.D = ls->P - probe_ray.P;
-    probe_ray.D = normalize_len(probe_ray.D, &probe_ray.t);
+    probe_ray.D = normalize_len(probe_ray.D, &probe_ray.tmax);
  }
  probe_ray.dP = differential_make_compact(sd->dP);
  probe_ray.dD = differential_zero_compact();
@@ -1042,9 +1050,7 @@ ccl_device_forceinline int kernel_path_mnee_sample(KernelGlobals kg,

    probe_ray.self.object = probe_isect.object;
    probe_ray.self.prim = probe_isect.prim;
-    probe_ray.P += probe_isect.t * probe_ray.D;
-    if (ls->t != FLT_MAX)
-      probe_ray.t -= probe_isect.t;
+    probe_ray.tmin = intersection_t_offset(probe_isect.t);
  };

  /* Mark the manifold walk invalid to keep mollification on by default. */
--- a/intern/cycles/kernel/integrator/path_state.h
+++ b/intern/cycles/kernel/integrator/path_state.h
@@ -52,7 +52,6 @@ ccl_device_inline void path_state_init_integrator(KernelGlobals kg,
  INTEGRATOR_STATE_WRITE(state, path, flag) = PATH_RAY_CAMERA | PATH_RAY_MIS_SKIP |
                                              PATH_RAY_TRANSPARENT_BACKGROUND;
  INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = 0.0f;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
  INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = FLT_MAX;
  INTEGRATOR_STATE_WRITE(state, path, continuation_probability) = 1.0f;
  INTEGRATOR_STATE_WRITE(state, path, throughput) = make_float3(1.0f, 1.0f, 1.0f);
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -62,11 +62,10 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
    const float3 ray_P = INTEGRATOR_STATE(state, ray, P);
    const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
    const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);

    /* multiple importance sampling, get background light pdf for ray
     * direction, and compute weight with respect to BSDF pdf */
-    const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
+    const float pdf = background_light_pdf(kg, ray_P, ray_D);
    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
    L *= mis_weight;
  }
@@ -213,7 +212,7 @@ ccl_device void integrator_shade_background(KernelGlobals kg,
  }
 #endif

-  INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
+  integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -22,19 +22,8 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
  const float3 ray_D = INTEGRATOR_STATE(state, ray, D);
  const float ray_time = INTEGRATOR_STATE(state, ray, time);

-  /* Advance ray beyond light. */
-  /* TODO: can we make this more numerically robust to avoid reintersecting the
-   * same light in some cases? Ray should not intersect surface anymore as the
-   * object and prim ids will prevent self intersection. */
-  const float3 new_ray_P = ray_P + ray_D * isect.t;
-  INTEGRATOR_STATE_WRITE(state, ray, P) = new_ray_P;
-  INTEGRATOR_STATE_WRITE(state, ray, t) -= isect.t;
-
-  /* Set position to where the BSDF was sampled, for correct MIS PDF. */
-  const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-  ray_P -= ray_D * mis_ray_t;
-  isect.t += mis_ray_t;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = isect.t;
+  /* Advance ray to new start distance. */
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(isect.t);

  LightSample ls ccl_optional_struct_init;
  const bool use_light_sample = light_sample_from_intersection(kg, &isect, ray_P, ray_D, &ls);
@@ -99,11 +88,13 @@ ccl_device void integrator_shade_light(KernelGlobals kg,
  INTEGRATOR_STATE_WRITE(state, path, transparent_bounce) = transparent_bounce;

  if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
-    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
+    integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT);
    return;
  }
  else {
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
+    integrator_path_next(kg,
+                         state,
+                         DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT,
                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
    return;
  }
--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -75,13 +75,9 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,
  ray.self.light_object = OBJECT_NONE;
  ray.self.light_prim = PRIM_NONE;
  /* Modify ray position and length to match current segment. */
-  const float start_t = (hit == 0) ? 0.0f :
-                                     INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t);
-  const float end_t = (hit < num_recorded_hits) ?
-                          INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit, t) :
-                          ray.t;
-  ray.P += start_t * ray.D;
-  ray.t = end_t - start_t;
+  ray.tmin = (hit == 0) ? ray.tmin : INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit - 1, t);
+  ray.tmax = (hit < num_recorded_hits) ? INTEGRATOR_STATE_ARRAY(state, shadow_isect, hit, t) :
+                                         ray.tmax;

  shader_setup_from_volume(kg, shadow_sd, &ray);

@@ -137,10 +133,7 @@ ccl_device_inline bool integrate_transparent_shadow(KernelGlobals kg,
    /* There are more hits that we could not recorded due to memory usage,
     * adjust ray to intersect again from the last hit. */
    const float last_hit_t = INTEGRATOR_STATE_ARRAY(state, shadow_isect, num_recorded_hits - 1, t);
-    const float3 ray_P = INTEGRATOR_STATE(state, shadow_ray, P);
-    const float3 ray_D = INTEGRATOR_STATE(state, shadow_ray, D);
-    INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray_P + last_hit_t * ray_D;
-    INTEGRATOR_STATE_WRITE(state, shadow_ray, t) -= last_hit_t;
+    INTEGRATOR_STATE_WRITE(state, shadow_ray, tmin) = intersection_t_offset(last_hit_t);
  }

  return false;
@@ -158,20 +151,22 @@ ccl_device void integrator_shade_shadow(KernelGlobals kg,
  /* Evaluate transparent shadows. */
  const bool opaque = integrate_transparent_shadow(kg, state, num_hits);
  if (opaque) {
-    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+    integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
    return;
  }
 #endif

  if (shadow_intersections_has_remaining(num_hits)) {
    /* More intersections to find, continue shadow ray. */
-    INTEGRATOR_SHADOW_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
+    integrator_shadow_path_next(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW,
                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW);
    return;
  }
  else {
    kernel_accum_light(kg, state, render_buffer);
-    INTEGRATOR_SHADOW_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
+    integrator_shadow_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW);
    return;
  }
 }
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -77,7 +77,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
 #  endif
  {
    const float bsdf_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float t = sd->ray_length + INTEGRATOR_STATE(state, path, mis_ray_t);
+    const float t = sd->ray_length;

    /* Multiple importance sampling, get triangle light pdf,
     * and compute weight with respect to BSDF pdf. */
@@ -190,8 +190,8 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
  const bool is_light = light_sample_is_light(&ls);

  /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(
-      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
+  IntegratorShadowState shadow_state = integrator_shadow_path_init(
+      kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false);

  /* Copy volume stack and enter/exit volume. */
  integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -323,16 +323,21 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
    return LABEL_NONE;
  }

-  /* Setup ray. Note that clipping works through transparent bounces. */
-  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-  INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
-  INTEGRATOR_STATE_WRITE(state, ray, t) = (label & LABEL_TRANSPARENT) ?
-                                              INTEGRATOR_STATE(state, ray, t) - sd->ray_length :
-                                              FLT_MAX;
+  if (label & LABEL_TRANSPARENT) {
+    /* Only need to modify start distance for transparent. */
+    INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(sd->ray_length);
+  }
+  else {
+    /* Setup ray with changed origin and direction. */
+    INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
+    INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(bsdf_omega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
+    INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #ifdef __RAY_DIFFERENTIALS__
-  INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-  INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in);
+    INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
+    INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(bsdf_domega_in);
 #endif
+  }

  /* Update throughput. */
  float3 throughput = INTEGRATOR_STATE(state, path, throughput);
@@ -349,12 +354,8 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(
  }

  /* Update path state */
-  if (label & LABEL_TRANSPARENT) {
-    INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) += sd->ray_length;
-  }
-  else {
+  if (!(label & LABEL_TRANSPARENT)) {
    INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = bsdf_pdf;
-    INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
    INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
        bsdf_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));
  }
@@ -371,17 +372,8 @@ ccl_device_forceinline int integrate_surface_volume_only_bounce(IntegratorState
    return LABEL_NONE;
  }

-  /* Setup ray position, direction stays unchanged. */
-  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
-
-  /* Clipping works through transparent. */
-  INTEGRATOR_STATE_WRITE(state, ray, t) -= sd->ray_length;
-
-#  ifdef __RAY_DIFFERENTIALS__
-  INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
-#  endif
-
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) += sd->ray_length;
+  /* Only modify start distance. */
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = intersection_t_offset(sd->ray_length);

  return LABEL_TRANSMIT | LABEL_TRANSPARENT;
 }
@@ -432,7 +424,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
  Ray ray ccl_optional_struct_init;
  ray.P = shadow_ray_offset(kg, sd, ao_D, &skip_self);
  ray.D = ao_D;
-  ray.t = kernel_data.integrator.ao_bounces_distance;
+  ray.tmin = 0.0f;
+  ray.tmax = kernel_data.integrator.ao_bounces_distance;
  ray.time = sd->time;
  ray.self.object = (skip_self) ? sd->object : OBJECT_NONE;
  ray.self.prim = (skip_self) ? sd->prim : PRIM_NONE;
@@ -442,7 +435,8 @@ ccl_device_forceinline void integrate_surface_ao(KernelGlobals kg,
  ray.dD = differential_zero_compact();

  /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, ao);
+  IntegratorShadowState shadow_state = integrator_shadow_path_init(
+      kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, true);

  /* Copy volume stack and enter/exit volume. */
  integrator_state_copy_volume_stack_to_shadow(kg, shadow_state, state);
@@ -604,22 +598,23 @@ ccl_device bool integrate_surface(KernelGlobals kg,
 }

 template<uint node_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE & ~KERNEL_FEATURE_NODE_RAYTRACE,
-         int current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
+         DeviceKernel current_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE>
 ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg,
                                                     IntegratorState state,
                                                     ccl_global float *ccl_restrict render_buffer)
 {
  if (integrate_surface<node_feature_mask>(kg, state, render_buffer)) {
    if (INTEGRATOR_STATE(state, path, flag) & PATH_RAY_SUBSURFACE) {
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
+      integrator_path_next(
+          kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
    }
    else {
-      kernel_assert(INTEGRATOR_STATE(state, ray, t) != 0.0f);
-      INTEGRATOR_PATH_NEXT(current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
+      kernel_assert(INTEGRATOR_STATE(state, ray, tmax) != 0.0f);
+      integrator_path_next(kg, state, current_kernel, DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
    }
  }
  else {
-    INTEGRATOR_PATH_TERMINATE(current_kernel);
+    integrator_path_terminate(kg, state, current_kernel);
  }
 }

--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -114,7 +114,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
 ccl_device_forceinline void volume_step_init(KernelGlobals kg,
                                             ccl_private const RNGState *rng_state,
                                             const float object_step_size,
-                                             float t,
+                                             const float tmin,
+                                             const float tmax,
                                             ccl_private float *step_size,
                                             ccl_private float *step_shade_offset,
                                             ccl_private float *steps_offset,
@@ -122,7 +123,7 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
 {
  if (object_step_size == FLT_MAX) {
    /* Homogeneous volume. */
-    *step_size = t;
+    *step_size = tmax - tmin;
    *step_shade_offset = 0.0f;
    *steps_offset = 1.0f;
    *max_steps = 1;
@@ -130,6 +131,7 @@ ccl_device_forceinline void volume_step_init(KernelGlobals kg,
  else {
    /* Heterogeneous volume. */
    *max_steps = kernel_data.integrator.volume_max_steps;
+    const float t = tmax - tmin;
    float step = min(object_step_size, t);

    /* compute exact steps in advance for malloc */
@@ -165,7 +167,7 @@ ccl_device void volume_shadow_homogeneous(KernelGlobals kg, IntegratorState stat
  float3 sigma_t = zero_float3();

  if (shadow_volume_shader_sample(kg, state, sd, &sigma_t)) {
-    *throughput *= volume_color_transmittance(sigma_t, ray->t);
+    *throughput *= volume_color_transmittance(sigma_t, ray->tmax - ray->tmin);
  }
 }
 #  endif
@@ -194,7 +196,8 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
  volume_step_init(kg,
                   &rng_state,
                   object_step_size,
-                   ray->t,
+                   ray->tmin,
+                   ray->tmax,
                   &step_size,
                   &step_shade_offset,
                   &unused,
@@ -202,13 +205,13 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,
  const float steps_offset = 1.0f;

  /* compute extinction at the start */
-  float t = 0.0f;
+  float t = ray->tmin;

  float3 sum = zero_float3();

  for (int i = 0; i < max_steps; i++) {
    /* advance to new position */
-    float new_t = min(ray->t, (i + steps_offset) * step_size);
+    float new_t = min(ray->tmax, ray->tmin + (i + steps_offset) * step_size);
    float dt = new_t - t;

    float3 new_P = ray->P + ray->D * (t + dt * step_shade_offset);
@@ -233,7 +236,7 @@ ccl_device void volume_shadow_heterogeneous(KernelGlobals kg,

    /* stop if at the end of the volume */
    t = new_t;
-    if (t == ray->t) {
+    if (t == ray->tmax) {
      /* Update throughput in case we haven't done it above */
      tp = *throughput * exp(sum);
      break;
@@ -257,15 +260,16 @@ ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict r
                                           const float xi,
                                           ccl_private float *pdf)
 {
-  const float t = ray->t;
+  const float tmin = ray->tmin;
+  const float tmax = ray->tmax;
  const float delta = dot((light_P - ray->P), ray->D);
  const float D = safe_sqrtf(len_squared(light_P - ray->P) - delta * delta);
  if (UNLIKELY(D == 0.0f)) {
    *pdf = 0.0f;
    return 0.0f;
  }
-  const float theta_a = -atan2f(delta, D);
-  const float theta_b = atan2f(t - delta, D);
+  const float theta_a = atan2f(tmin - delta, D);
+  const float theta_b = atan2f(tmax - delta, D);
  const float t_ = D * tanf((xi * theta_b) + (1 - xi) * theta_a);
  if (UNLIKELY(theta_b == theta_a)) {
    *pdf = 0.0f;
@@ -273,7 +277,7 @@ ccl_device float volume_equiangular_sample(ccl_private const Ray *ccl_restrict r
  }
  *pdf = D / ((theta_b - theta_a) * (D * D + t_ * t_));

-  return min(t, delta + t_); /* min is only for float precision errors */
+  return clamp(delta + t_, tmin, tmax); /* clamp is only for float precision errors */
 }

 ccl_device float volume_equiangular_pdf(ccl_private const Ray *ccl_restrict ray,
@@ -286,11 +290,12 @@ ccl_device float volume_equiangular_pdf(ccl_private const Ray *ccl_restrict ray,
    return 0.0f;
  }

-  const float t = ray->t;
+  const float tmin = ray->tmin;
+  const float tmax = ray->tmax;
  const float t_ = sample_t - delta;

-  const float theta_a = -atan2f(delta, D);
-  const float theta_b = atan2f(t - delta, D);
+  const float theta_a = atan2f(tmin - delta, D);
+  const float theta_b = atan2f(tmax - delta, D);
  if (UNLIKELY(theta_b == theta_a)) {
    return 0.0f;
  }
@@ -310,11 +315,12 @@ ccl_device float volume_equiangular_cdf(ccl_private const Ray *ccl_restrict ray,
    return 0.0f;
  }

-  const float t = ray->t;
+  const float tmin = ray->tmin;
+  const float tmax = ray->tmax;
  const float t_ = sample_t - delta;

-  const float theta_a = -atan2f(delta, D);
-  const float theta_b = atan2f(t - delta, D);
+  const float theta_a = atan2f(tmin - delta, D);
+  const float theta_b = atan2f(tmax - delta, D);
  if (UNLIKELY(theta_b == theta_a)) {
    return 0.0f;
  }
@@ -390,8 +396,8 @@ ccl_device float3 volume_emission_integrate(ccl_private VolumeShaderCoefficients

 typedef struct VolumeIntegrateState {
  /* Volume segment extents. */
-  float start_t;
-  float end_t;
+  float tmin;
+  float tmax;

  /* If volume is absorption-only up to this point, and no probabilistic
   * scattering or termination has been used yet. */
@@ -426,9 +432,9 @@ ccl_device_forceinline void volume_integrate_step_scattering(

  /* Equiangular sampling for direct lighting. */
  if (vstate.direct_sample_method == VOLUME_SAMPLE_EQUIANGULAR && !result.direct_scatter) {
-    if (result.direct_t >= vstate.start_t && result.direct_t <= vstate.end_t &&
+    if (result.direct_t >= vstate.tmin && result.direct_t <= vstate.tmax &&
        vstate.equiangular_pdf > VOLUME_SAMPLE_PDF_CUTOFF) {
-      const float new_dt = result.direct_t - vstate.start_t;
+      const float new_dt = result.direct_t - vstate.tmin;
      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);

      result.direct_scatter = true;
@@ -458,7 +464,7 @@ ccl_device_forceinline void volume_integrate_step_scattering(
      /* compute sampling distance */
      const float sample_sigma_t = volume_channel_get(coeff.sigma_t, channel);
      const float new_dt = -logf(1.0f - vstate.rscatter) / sample_sigma_t;
-      const float new_t = vstate.start_t + new_dt;
+      const float new_t = vstate.tmin + new_dt;

      /* transmittance and pdf */
      const float3 new_transmittance = volume_color_transmittance(coeff.sigma_t, new_dt);
@@ -528,7 +534,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
  volume_step_init(kg,
                   rng_state,
                   object_step_size,
-                   ray->t,
+                   ray->tmin,
+                   ray->tmax,
                   &step_size,
                   &step_shade_offset,
                   &steps_offset,
@@ -536,8 +543,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(

  /* Initialize volume integration state. */
  VolumeIntegrateState vstate ccl_optional_struct_init;
-  vstate.start_t = 0.0f;
-  vstate.end_t = 0.0f;
+  vstate.tmin = ray->tmin;
+  vstate.tmax = ray->tmin;
  vstate.absorption_only = true;
  vstate.rscatter = path_state_rng_1D(kg, rng_state, PRNG_SCATTER_DISTANCE);
  vstate.rphase = path_state_rng_1D(kg, rng_state, PRNG_PHASE_CHANNEL);
@@ -578,8 +585,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(

  for (int i = 0; i < max_steps; i++) {
    /* Advance to new position */
-    vstate.end_t = min(ray->t, (i + steps_offset) * step_size);
-    const float shade_t = vstate.start_t + (vstate.end_t - vstate.start_t) * step_shade_offset;
+    vstate.tmax = min(ray->tmax, ray->tmin + (i + steps_offset) * step_size);
+    const float shade_t = vstate.tmin + (vstate.tmax - vstate.tmin) * step_shade_offset;
    sd->P = ray->P + ray->D * shade_t;

    /* compute segment */
@@ -588,7 +595,7 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
      const int closure_flag = sd->flag;

      /* Evaluate transmittance over segment. */
-      const float dt = (vstate.end_t - vstate.start_t);
+      const float dt = (vstate.tmax - vstate.tmin);
      const float3 transmittance = (closure_flag & SD_EXTINCTION) ?
                                       volume_color_transmittance(coeff.sigma_t, dt) :
                                       one_float3();
@@ -645,8 +652,8 @@ ccl_device_forceinline void volume_integrate_heterogeneous(
    }

    /* Stop if at the end of the volume. */
-    vstate.start_t = vstate.end_t;
-    if (vstate.start_t == ray->t) {
+    vstate.tmin = vstate.tmax;
+    if (vstate.tmin == ray->tmax) {
      break;
    }
  }
@@ -774,8 +781,8 @@ ccl_device_forceinline void integrate_volume_direct_light(
  const bool is_light = light_sample_is_light(ls);

  /* Branch off shadow kernel. */
-  INTEGRATOR_SHADOW_PATH_INIT(
-      shadow_state, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, shadow);
+  IntegratorShadowState shadow_state = integrator_shadow_path_init(
+      kg, state, DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW, false);

  /* Write shadow ray and associated state to global memory. */
  integrator_state_write_shadow_ray(kg, shadow_state, &ray);
@@ -880,7 +887,8 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(
  /* Setup ray. */
  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
  INTEGRATOR_STATE_WRITE(state, ray, D) = normalize(phase_omega_in);
-  INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX;
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
 #  ifdef __RAY_DIFFERENTIALS__
  INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
  INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_make_compact(phase_domega_in);
@@ -901,7 +909,6 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(

  /* Update path state */
  INTEGRATOR_STATE_WRITE(state, path, mis_ray_pdf) = phase_pdf;
-  INTEGRATOR_STATE_WRITE(state, path, mis_ray_t) = 0.0f;
  INTEGRATOR_STATE_WRITE(state, path, min_ray_pdf) = fminf(
      phase_pdf, INTEGRATOR_STATE(state, path, min_ray_pdf));

@@ -1021,7 +1028,7 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,
  integrator_state_read_isect(kg, state, &isect);

  /* Set ray length to current segment. */
-  ray.t = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;
+  ray.tmax = (isect.prim != PRIM_NONE) ? isect.t : FLT_MAX;

  /* Clean volume stack for background rays. */
  if (isect.prim == PRIM_NONE) {
@@ -1032,13 +1039,15 @@ ccl_device void integrator_shade_volume(KernelGlobals kg,

  if (event == VOLUME_PATH_SCATTERED) {
    /* Queue intersect_closest kernel. */
-    INTEGRATOR_PATH_NEXT(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
+    integrator_path_next(kg,
+                         state,
+                         DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME,
                         DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST);
    return;
  }
  else if (event == VOLUME_PATH_MISSED) {
    /* End path. */
-    INTEGRATOR_PATH_TERMINATE(DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
+    integrator_path_terminate(kg, state, DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME);
    return;
  }
  else {
--- a/intern/cycles/kernel/integrator/shadow_catcher.h
+++ b/intern/cycles/kernel/integrator/shadow_catcher.h
@@ -50,7 +50,7 @@ ccl_device_inline bool kernel_shadow_catcher_is_path_split_bounce(KernelGlobals
 ccl_device_inline bool kernel_shadow_catcher_path_can_split(KernelGlobals kg,
                                                            ConstIntegratorState state)
 {
-  if (INTEGRATOR_PATH_IS_TERMINATED) {
+  if (integrator_path_is_terminated(state)) {
    return false;
  }

--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -47,7 +47,8 @@ KERNEL_STRUCT_END(shadow_path)
 KERNEL_STRUCT_BEGIN(shadow_ray)
 KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, int, object, KERNEL_FEATURE_PATH_TRACING)
--- a/intern/cycles/kernel/integrator/state.h
+++ b/intern/cycles/kernel/integrator/state.h
@@ -127,6 +127,9 @@ typedef struct IntegratorStateGPU {

  /* Index of main path which will be used by a next shadow catcher split.  */
  ccl_global int *next_main_path_index;
+
+  /* Divisor used to partition active indices by locality when sorting by material.  */
+  uint sort_partition_divisor;
 } IntegratorStateGPU;

 /* Abstraction
--- a/intern/cycles/kernel/integrator/state_flow.h
+++ b/intern/cycles/kernel/integrator/state_flow.h
@@ -10,125 +10,196 @@ CCL_NAMESPACE_BEGIN

 /* Control Flow
 *
- * Utilities for control flow between kernels. The implementation may differ per device
- * or even be handled on the host side. To abstract such differences, experiment with
- * different implementations and for debugging, this is abstracted using macros.
+ * Utilities for control flow between kernels. The implementation is different between CPU and
+ * GPU devices. For the latter part of the logic is handled on the host side with wavefronts.
 *
 * There is a main path for regular path tracing camera for path tracing. Shadows for next
 * event estimation branch off from this into their own path, that may be computed in
- * parallel while the main path continues.
+ * parallel while the main path continues. Additionally, shading kernels are sorted using
+ * a key for coherence.
 *
 * Each kernel on the main path must call one of these functions. These may not be called
 * multiple times from the same kernel.
 *
- * INTEGRATOR_PATH_INIT(next_kernel)
- * INTEGRATOR_PATH_NEXT(current_kernel, next_kernel)
- * INTEGRATOR_PATH_TERMINATE(current_kernel)
+ * integrator_path_init(kg, state, next_kernel)
+ * integrator_path_next(kg, state, current_kernel, next_kernel)
+ * integrator_path_terminate(kg, state, current_kernel)
 *
 * For the shadow path similar functions are used, and again each shadow kernel must call
 * one of them, and only once.
 */

-#define INTEGRATOR_PATH_IS_TERMINATED (INTEGRATOR_STATE(state, path, queued_kernel) == 0)
-#define INTEGRATOR_SHADOW_PATH_IS_TERMINATED \
-  (INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0)
+ccl_device_forceinline bool integrator_path_is_terminated(ConstIntegratorState state)
+{
+  return INTEGRATOR_STATE(state, path, queued_kernel) == 0;
+}
+
+ccl_device_forceinline bool integrator_shadow_path_is_terminated(ConstIntegratorShadowState state)
+{
+  return INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0;
+}

 #ifdef __KERNEL_GPU__

-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}

-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32( \
-        &kernel_integrator_state.next_shadow_path_index[0], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], \
-                                1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    atomic_fetch_and_sub_uint32( \
-        &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}

-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      const int key_ = key; \
-      atomic_fetch_and_sub_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[current_kernel], 1); \
-      atomic_fetch_and_add_uint32( \
-          &kernel_integrator_state.queue_counter->num_queued[next_kernel], 1); \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_; \
-      atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], \
-                                  1); \
-    }
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = atomic_fetch_and_add_uint32(
+      &kernel_integrator_state.next_shadow_path_index[0], 1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+}
+
+/* Sort first by truncated state index (for good locality), then by key (for good coherence). */
+#  define INTEGRATOR_SORT_KEY(key, state) \
+    (key + kernel_data.max_shaders * (state / kernel_integrator_state.sort_partition_divisor))
+
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  const int key_ = INTEGRATOR_SORT_KEY(key, state);
+  atomic_fetch_and_sub_uint32(&kernel_integrator_state.queue_counter->num_queued[current_kernel],
+                              1);
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.queue_counter->num_queued[next_kernel], 1);
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  INTEGRATOR_STATE_WRITE(state, path, shader_sort_key) = key_;
+  atomic_fetch_and_add_uint32(&kernel_integrator_state.sort_key_counter[next_kernel][key_], 1);
+}

 #else

-#  define INTEGRATOR_PATH_INIT(next_kernel) \
-    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_PATH_INIT_SORTED(next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-    }
-#  define INTEGRATOR_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_PATH_NEXT_SORTED(current_kernel, next_kernel, key) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel; \
-      (void)key; \
-      (void)current_kernel; \
-    }
+ccl_device_forceinline void integrator_path_init(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+}

-#  define INTEGRATOR_SHADOW_PATH_INIT(shadow_state, state, next_kernel, shadow_type) \
-    IntegratorShadowState shadow_state = &state->shadow_type; \
-    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
-#  define INTEGRATOR_SHADOW_PATH_NEXT(current_kernel, next_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel; \
-      (void)current_kernel; \
-    }
-#  define INTEGRATOR_SHADOW_PATH_TERMINATE(current_kernel) \
-    { \
-      INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0; \
-      (void)current_kernel; \
-    }
+ccl_device_forceinline void integrator_path_init_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+}
+
+ccl_device_forceinline void integrator_path_next(KernelGlobals kg,
+                                                 IntegratorState state,
+                                                 const DeviceKernel current_kernel,
+                                                 const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_terminate(KernelGlobals kg,
+                                                      IntegratorState state,
+                                                      const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_path_next_sorted(KernelGlobals kg,
+                                                        IntegratorState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel,
+                                                        const uint32_t key)
+{
+  INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = next_kernel;
+  (void)key;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline IntegratorShadowState integrator_shadow_path_init(
+    KernelGlobals kg, IntegratorState state, const DeviceKernel next_kernel, const bool is_ao)
+{
+  IntegratorShadowState shadow_state = (is_ao) ? &state->ao : &state->shadow;
+  INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, queued_kernel) = next_kernel;
+  return shadow_state;
+}
+
+ccl_device_forceinline void integrator_shadow_path_next(KernelGlobals kg,
+                                                        IntegratorShadowState state,
+                                                        const DeviceKernel current_kernel,
+                                                        const DeviceKernel next_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = next_kernel;
+  (void)current_kernel;
+}
+
+ccl_device_forceinline void integrator_shadow_path_terminate(KernelGlobals kg,
+                                                             IntegratorShadowState state,
+                                                             const DeviceKernel current_kernel)
+{
+  INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
+  (void)current_kernel;
+}

 #endif

--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -37,11 +37,10 @@ KERNEL_STRUCT_MEMBER(path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* enum PathRayMNEE */
 KERNEL_STRUCT_MEMBER(path, uint8_t, mnee, KERNEL_FEATURE_PATH_TRACING)
 /* Multiple importance sampling
- * The PDF of BSDF sampling at the last scatter point, and distance to the
- * last scatter point minus the last ray segment. This distance lets us
- * compute the complete distance through transparent surfaces and volumes. */
+ * The PDF of BSDF sampling at the last scatter point, which is at ray distance
+ * zero and distance. Note that transparency and volume attenuation increase
+ * the ray tmin but keep P unmodified so that this works. */
 KERNEL_STRUCT_MEMBER(path, float, mis_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(path, float, mis_ray_t, KERNEL_FEATURE_PATH_TRACING)
 /* Filter glossy. */
 KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
@@ -63,7 +62,8 @@ KERNEL_STRUCT_END(path)
 KERNEL_STRUCT_BEGIN(ray)
 KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -17,7 +17,8 @@ ccl_device_forceinline void integrator_state_write_ray(KernelGlobals kg,
 {
  INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P;
  INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D;
-  INTEGRATOR_STATE_WRITE(state, ray, t) = ray->t;
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = ray->tmin;
+  INTEGRATOR_STATE_WRITE(state, ray, tmax) = ray->tmax;
  INTEGRATOR_STATE_WRITE(state, ray, time) = ray->time;
  INTEGRATOR_STATE_WRITE(state, ray, dP) = ray->dP;
  INTEGRATOR_STATE_WRITE(state, ray, dD) = ray->dD;
@@ -29,7 +30,8 @@ ccl_device_forceinline void integrator_state_read_ray(KernelGlobals kg,
 {
  ray->P = INTEGRATOR_STATE(state, ray, P);
  ray->D = INTEGRATOR_STATE(state, ray, D);
-  ray->t = INTEGRATOR_STATE(state, ray, t);
+  ray->tmin = INTEGRATOR_STATE(state, ray, tmin);
+  ray->tmax = INTEGRATOR_STATE(state, ray, tmax);
  ray->time = INTEGRATOR_STATE(state, ray, time);
  ray->dP = INTEGRATOR_STATE(state, ray, dP);
  ray->dD = INTEGRATOR_STATE(state, ray, dD);
@@ -42,7 +44,8 @@ ccl_device_forceinline void integrator_state_write_shadow_ray(
 {
  INTEGRATOR_STATE_WRITE(state, shadow_ray, P) = ray->P;
  INTEGRATOR_STATE_WRITE(state, shadow_ray, D) = ray->D;
-  INTEGRATOR_STATE_WRITE(state, shadow_ray, t) = ray->t;
+  INTEGRATOR_STATE_WRITE(state, shadow_ray, tmin) = ray->tmin;
+  INTEGRATOR_STATE_WRITE(state, shadow_ray, tmax) = ray->tmax;
  INTEGRATOR_STATE_WRITE(state, shadow_ray, time) = ray->time;
  INTEGRATOR_STATE_WRITE(state, shadow_ray, dP) = ray->dP;
 }
@@ -53,7 +56,8 @@ ccl_device_forceinline void integrator_state_read_shadow_ray(KernelGlobals kg,
 {
  ray->P = INTEGRATOR_STATE(state, shadow_ray, P);
  ray->D = INTEGRATOR_STATE(state, shadow_ray, D);
-  ray->t = INTEGRATOR_STATE(state, shadow_ray, t);
+  ray->tmin = INTEGRATOR_STATE(state, shadow_ray, tmin);
+  ray->tmax = INTEGRATOR_STATE(state, shadow_ray, tmax);
  ray->time = INTEGRATOR_STATE(state, shadow_ray, time);
  ray->dP = INTEGRATOR_STATE(state, shadow_ray, dP);
  ray->dD = differential_zero_compact();
--- a/intern/cycles/kernel/integrator/subsurface.h
+++ b/intern/cycles/kernel/integrator/subsurface.h
@@ -38,7 +38,8 @@ ccl_device int subsurface_bounce(KernelGlobals kg,
  /* Setup ray into surface. */
  INTEGRATOR_STATE_WRITE(state, ray, P) = sd->P;
  INTEGRATOR_STATE_WRITE(state, ray, D) = bssrdf->N;
-  INTEGRATOR_STATE_WRITE(state, ray, t) = FLT_MAX;
+  INTEGRATOR_STATE_WRITE(state, ray, tmin) = 0.0f;
+  INTEGRATOR_STATE_WRITE(state, ray, tmax) = FLT_MAX;
  INTEGRATOR_STATE_WRITE(state, ray, dP) = differential_make_compact(sd->dP);
  INTEGRATOR_STATE_WRITE(state, ray, dD) = differential_zero_compact();

@@ -160,7 +161,7 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
  /* Pretend ray is coming from the outside towards the exit point. This ensures
   * correct front/back facing normals.
   * TODO: find a more elegant solution? */
-  ray.P += ray.D * ray.t * 2.0f;
+  ray.P += ray.D * ray.tmax * 2.0f;
  ray.D = -ray.D;

  integrator_state_write_isect(kg, state, &ss_isect.hits[0]);
@@ -177,17 +178,23 @@ ccl_device_inline bool subsurface_scatter(KernelGlobals kg, IntegratorState stat
  const bool use_raytrace_kernel = (shader_flags & SD_HAS_RAYTRACE);

  if (use_caustics) {
-    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+    integrator_path_next_sorted(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE,
                                shader);
  }
  else if (use_raytrace_kernel) {
-    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+    integrator_path_next_sorted(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE,
                                shader);
  }
  else {
-    INTEGRATOR_PATH_NEXT_SORTED(DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
+    integrator_path_next_sorted(kg,
+                                state,
+                                DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE,
                                DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE,
                                shader);
  }
--- a/intern/cycles/kernel/integrator/subsurface_disk.h
+++ b/intern/cycles/kernel/integrator/subsurface_disk.h
@@ -82,7 +82,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,
  /* Create ray. */
  ray.P = P + disk_N * disk_height + disk_P;
  ray.D = -disk_N;
-  ray.t = 2.0f * disk_height;
+  ray.tmin = 0.0f;
+  ray.tmax = 2.0f * disk_height;
  ray.dP = ray_dP;
  ray.dD = differential_zero_compact();
  ray.time = time;
@@ -188,7 +189,8 @@ ccl_device_inline bool subsurface_disk(KernelGlobals kg,

      ray.P = ray.P + ray.D * ss_isect.hits[hit].t;
      ray.D = ss_isect.Ng[hit];
-      ray.t = 1.0f;
+      ray.tmin = 0.0f;
+      ray.tmax = 1.0f;
      return true;
    }

--- a/intern/cycles/kernel/integrator/subsurface_random_walk.h
+++ b/intern/cycles/kernel/integrator/subsurface_random_walk.h
@@ -195,7 +195,8 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
  /* Setup ray. */
  ray.P = P;
  ray.D = D;
-  ray.t = FLT_MAX;
+  ray.tmin = 0.0f;
+  ray.tmax = FLT_MAX;
  ray.time = time;
  ray.dP = ray_dP;
  ray.dD = differential_zero_compact();
@@ -370,10 +371,10 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
     * chance of connecting to it.
     * TODO: Maybe use less than 10 times the mean free path? */
    if (bounce == 0) {
-      ray.t = max(t, 10.0f / (reduce_min(sigma_t)));
+      ray.tmax = max(t, 10.0f / (reduce_min(sigma_t)));
    }
    else {
-      ray.t = t;
+      ray.tmax = t;
      /* After the first bounce the object can intersect the same surface again */
      ray.self.object = OBJECT_NONE;
      ray.self.prim = PRIM_NONE;
@@ -384,12 +385,12 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
    if (hit) {
 #ifdef __KERNEL_GPU_RAYTRACING__
      /* t is always in world space with OptiX and MetalRT. */
-      ray.t = ss_isect.hits[0].t;
+      ray.tmax = ss_isect.hits[0].t;
 #else
      /* Compute world space distance to surface hit. */
      float3 D = transform_direction(&ob_itfm, ray.D);
      D = normalize(D) * ss_isect.hits[0].t;
-      ray.t = len(transform_direction(&ob_tfm, D));
+      ray.tmax = len(transform_direction(&ob_tfm, D));
 #endif
    }

@@ -397,16 +398,16 @@ ccl_device_inline bool subsurface_random_walk(KernelGlobals kg,
      /* Check if we hit the opposite side. */
      if (hit) {
        have_opposite_interface = true;
-        opposite_distance = dot(ray.P + ray.t * ray.D - P, -N);
+        opposite_distance = dot(ray.P + ray.tmax * ray.D - P, -N);
      }
      /* Apart from the opposite side check, we were supposed to only trace up to distance t,
       * so check if there would have been a hit in that case. */
-      hit = ray.t < t;
+      hit = ray.tmax < t;
    }

    /* Use the distance to the exit point for the throughput update if we found one. */
    if (hit) {
-      t = ray.t;
+      t = ray.tmax;
    }

    /* Advance to new scatter location. */
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -270,31 +270,26 @@ ccl_device bool lights_intersect(KernelGlobals kg,

    if (type == LIGHT_SPOT) {
      /* Spot/Disk light. */
-      const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-      const float3 ray_P = ray->P - ray->D * mis_ray_t;
-
      const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
      const float radius = klight->spot.radius;
      if (radius == 0.0f) {
        continue;
      }
      /* disk oriented normal */
-      const float3 lightN = normalize(ray_P - lightP);
+      const float3 lightN = normalize(ray->P - lightP);
      /* One sided. */
      if (dot(ray->D, lightN) >= 0.0f) {
        continue;
      }

      float3 P;
-      if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) {
+      if (!ray_disk_intersect(
+              ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) {
        continue;
      }
    }
    else if (type == LIGHT_POINT) {
      /* Sphere light (aka, aligned disk light). */
-      const float mis_ray_t = INTEGRATOR_STATE(state, path, mis_ray_t);
-      const float3 ray_P = ray->P - ray->D * mis_ray_t;
-
      const float3 lightP = make_float3(klight->co[0], klight->co[1], klight->co[2]);
      const float radius = klight->spot.radius;
      if (radius == 0.0f) {
@@ -302,9 +297,10 @@ ccl_device bool lights_intersect(KernelGlobals kg,
      }

      /* disk oriented normal */
-      const float3 lightN = normalize(ray_P - lightP);
+      const float3 lightN = normalize(ray->P - lightP);
      float3 P;
-      if (!ray_disk_intersect(ray->P, ray->D, ray->t, lightP, lightN, radius, &P, &t)) {
+      if (!ray_disk_intersect(
+              ray->P, ray->D, ray->tmin, ray->tmax, lightP, lightN, radius, &P, &t)) {
        continue;
      }
    }
@@ -330,8 +326,19 @@ ccl_device bool lights_intersect(KernelGlobals kg,
      const float3 light_P = make_float3(klight->co[0], klight->co[1], klight->co[2]);

      float3 P;
-      if (!ray_quad_intersect(
-              ray->P, ray->D, 0.0f, ray->t, light_P, axisu, axisv, Ng, &P, &t, &u, &v, is_round)) {
+      if (!ray_quad_intersect(ray->P,
+                              ray->D,
+                              ray->tmin,
+                              ray->tmax,
+                              light_P,
+                              axisu,
+                              axisv,
+                              Ng,
+                              &P,
+                              &t,
+                              &u,
+                              &v,
+                              is_round)) {
        continue;
      }
    }
@@ -775,7 +782,8 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
    ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);

    /* calculate intersection with the planar triangle */
-    if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
+    if (!ray_triangle_intersect(
+            P, ls->D, 0.0f, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
      ls->pdf = 0.0f;
      return;
    }
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -227,23 +227,24 @@ ccl_device_inline void shadow_ray_setup(ccl_private const ShaderData *ccl_restri
  if (ls->shader & SHADER_CAST_SHADOW) {
    /* setup ray */
    ray->P = P;
+    ray->tmin = 0.0f;

    if (ls->t == FLT_MAX) {
      /* distant light */
      ray->D = ls->D;
-      ray->t = ls->t;
+      ray->tmax = ls->t;
    }
    else {
      /* other lights, avoid self-intersection */
      ray->D = ls->P - P;
-      ray->D = normalize_len(ray->D, &ray->t);
+      ray->D = normalize_len(ray->D, &ray->tmax);
    }
  }
  else {
    /* signal to not cast shadow ray */
    ray->P = zero_float3();
    ray->D = zero_float3();
-    ray->t = 0.0f;
+    ray->tmax = 0.0f;
  }

  ray->dP = differential_make_compact(sd->dP);
--- a/intern/cycles/kernel/osl/services.cpp
+++ b/intern/cycles/kernel/osl/services.cpp
@@ -1094,10 +1094,8 @@ bool OSLRenderServices::get_background_attribute(const KernelGlobalsCPU *kg,
      ndc[0] = camera_world_to_ndc(kg, sd, sd->ray_P);

      if (derivatives) {
-        ndc[1] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f)) -
-                 ndc[0];
-        ndc[2] = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f)) -
-                 ndc[0];
+        ndc[1] = zero_float3();
+        ndc[2] = zero_float3();
      }
    }
    else {
@@ -1671,7 +1669,8 @@ bool OSLRenderServices::trace(TraceOpt &options,

  ray.P = TO_FLOAT3(P);
  ray.D = TO_FLOAT3(R);
-  ray.t = (options.maxdist == 1.0e30f) ? FLT_MAX : options.maxdist - options.mindist;
+  ray.tmin = 0.0f;
+  ray.tmax = (options.maxdist == 1.0e30f) ? FLT_MAX : options.maxdist - options.mindist;
  ray.time = sd->time;
  ray.self.object = OBJECT_NONE;
  ray.self.prim = PRIM_NONE;
--- a/intern/cycles/kernel/svm/ao.h
+++ b/intern/cycles/kernel/svm/ao.h
@@ -59,7 +59,8 @@ ccl_device float svm_ao(
    Ray ray;
    ray.P = sd->P;
    ray.D = D.x * T + D.y * B + D.z * N;
-    ray.t = max_dist;
+    ray.tmin = 0.0f;
+    ray.tmax = max_dist;
    ray.time = sd->time;
    ray.self.object = sd->object;
    ray.self.prim = sd->prim;
--- a/intern/cycles/kernel/svm/bevel.h
+++ b/intern/cycles/kernel/svm/bevel.h
@@ -179,7 +179,8 @@ ccl_device float3 svm_bevel(
    Ray ray ccl_optional_struct_init;
    ray.P = sd->P + disk_N * disk_height + disk_P;
    ray.D = -disk_N;
-    ray.t = 2.0f * disk_height;
+    ray.tmin = 0.0f;
+    ray.tmax = 2.0f * disk_height;
    ray.dP = differential_zero_compact();
    ray.dD = differential_zero_compact();
    ray.time = sd->time;
--- a/intern/cycles/kernel/svm/node_types_template.h
+++ b/intern/cycles/kernel/svm/node_types_template.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: Apache-2.0
+ * Copyright 2011-2022 Blender Foundation */
+
+#ifndef SHADER_NODE_TYPE
+#  define SHADER_NODE_TYPE(name)
+#endif
+
+/* NOTE: for best OpenCL performance, item definition in the enum must
+ * match the switch case order in `svm.h`. */
+
+SHADER_NODE_TYPE(NODE_END)
+SHADER_NODE_TYPE(NODE_SHADER_JUMP)
+SHADER_NODE_TYPE(NODE_CLOSURE_BSDF)
+SHADER_NODE_TYPE(NODE_CLOSURE_EMISSION)
+SHADER_NODE_TYPE(NODE_CLOSURE_BACKGROUND)
+SHADER_NODE_TYPE(NODE_CLOSURE_SET_WEIGHT)
+SHADER_NODE_TYPE(NODE_CLOSURE_WEIGHT)
+SHADER_NODE_TYPE(NODE_EMISSION_WEIGHT)
+SHADER_NODE_TYPE(NODE_MIX_CLOSURE)
+SHADER_NODE_TYPE(NODE_JUMP_IF_ZERO)
+SHADER_NODE_TYPE(NODE_JUMP_IF_ONE)
+SHADER_NODE_TYPE(NODE_GEOMETRY)
+SHADER_NODE_TYPE(NODE_CONVERT)
+SHADER_NODE_TYPE(NODE_TEX_COORD)
+SHADER_NODE_TYPE(NODE_VALUE_F)
+SHADER_NODE_TYPE(NODE_VALUE_V)
+SHADER_NODE_TYPE(NODE_ATTR)
+SHADER_NODE_TYPE(NODE_VERTEX_COLOR)
+SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DX)
+SHADER_NODE_TYPE(NODE_GEOMETRY_BUMP_DY)
+SHADER_NODE_TYPE(NODE_SET_DISPLACEMENT)
+SHADER_NODE_TYPE(NODE_DISPLACEMENT)
+SHADER_NODE_TYPE(NODE_VECTOR_DISPLACEMENT)
+SHADER_NODE_TYPE(NODE_TEX_IMAGE)
+SHADER_NODE_TYPE(NODE_TEX_IMAGE_BOX)
+SHADER_NODE_TYPE(NODE_TEX_NOISE)
+SHADER_NODE_TYPE(NODE_SET_BUMP)
+SHADER_NODE_TYPE(NODE_ATTR_BUMP_DX)
+SHADER_NODE_TYPE(NODE_ATTR_BUMP_DY)
+SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DX)
+SHADER_NODE_TYPE(NODE_VERTEX_COLOR_BUMP_DY)
+SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DX)
+SHADER_NODE_TYPE(NODE_TEX_COORD_BUMP_DY)
+SHADER_NODE_TYPE(NODE_CLOSURE_SET_NORMAL)
+SHADER_NODE_TYPE(NODE_ENTER_BUMP_EVAL)
+SHADER_NODE_TYPE(NODE_LEAVE_BUMP_EVAL)
+SHADER_NODE_TYPE(NODE_HSV)
+SHADER_NODE_TYPE(NODE_CLOSURE_HOLDOUT)
+SHADER_NODE_TYPE(NODE_FRESNEL)
+SHADER_NODE_TYPE(NODE_LAYER_WEIGHT)
+SHADER_NODE_TYPE(NODE_CLOSURE_VOLUME)
+SHADER_NODE_TYPE(NODE_PRINCIPLED_VOLUME)
+SHADER_NODE_TYPE(NODE_MATH)
+SHADER_NODE_TYPE(NODE_VECTOR_MATH)
+SHADER_NODE_TYPE(NODE_RGB_RAMP)
+SHADER_NODE_TYPE(NODE_GAMMA)
+SHADER_NODE_TYPE(NODE_BRIGHTCONTRAST)
+SHADER_NODE_TYPE(NODE_LIGHT_PATH)
+SHADER_NODE_TYPE(NODE_OBJECT_INFO)
+SHADER_NODE_TYPE(NODE_PARTICLE_INFO)
+SHADER_NODE_TYPE(NODE_HAIR_INFO)
+SHADER_NODE_TYPE(NODE_POINT_INFO)
+SHADER_NODE_TYPE(NODE_TEXTURE_MAPPING)
+SHADER_NODE_TYPE(NODE_MAPPING)
+SHADER_NODE_TYPE(NODE_MIN_MAX)
+SHADER_NODE_TYPE(NODE_CAMERA)
+SHADER_NODE_TYPE(NODE_TEX_ENVIRONMENT)
+SHADER_NODE_TYPE(NODE_TEX_SKY)
+SHADER_NODE_TYPE(NODE_TEX_GRADIENT)
+SHADER_NODE_TYPE(NODE_TEX_VORONOI)
+SHADER_NODE_TYPE(NODE_TEX_MUSGRAVE)
+SHADER_NODE_TYPE(NODE_TEX_WAVE)
+SHADER_NODE_TYPE(NODE_TEX_MAGIC)
+SHADER_NODE_TYPE(NODE_TEX_CHECKER)
+SHADER_NODE_TYPE(NODE_TEX_BRICK)
+SHADER_NODE_TYPE(NODE_TEX_WHITE_NOISE)
+SHADER_NODE_TYPE(NODE_NORMAL)
+SHADER_NODE_TYPE(NODE_LIGHT_FALLOFF)
+SHADER_NODE_TYPE(NODE_IES)
+SHADER_NODE_TYPE(NODE_CURVES)
+SHADER_NODE_TYPE(NODE_TANGENT)
+SHADER_NODE_TYPE(NODE_NORMAL_MAP)
+SHADER_NODE_TYPE(NODE_INVERT)
+SHADER_NODE_TYPE(NODE_MIX)
+SHADER_NODE_TYPE(NODE_SEPARATE_COLOR)
+SHADER_NODE_TYPE(NODE_COMBINE_COLOR)
+SHADER_NODE_TYPE(NODE_SEPARATE_VECTOR)
+SHADER_NODE_TYPE(NODE_COMBINE_VECTOR)
+SHADER_NODE_TYPE(NODE_SEPARATE_HSV)
+SHADER_NODE_TYPE(NODE_COMBINE_HSV)
+SHADER_NODE_TYPE(NODE_VECTOR_ROTATE)
+SHADER_NODE_TYPE(NODE_VECTOR_TRANSFORM)
+SHADER_NODE_TYPE(NODE_WIREFRAME)
+SHADER_NODE_TYPE(NODE_WAVELENGTH)
+SHADER_NODE_TYPE(NODE_BLACKBODY)
+SHADER_NODE_TYPE(NODE_MAP_RANGE)
+SHADER_NODE_TYPE(NODE_VECTOR_MAP_RANGE)
+SHADER_NODE_TYPE(NODE_CLAMP)
+SHADER_NODE_TYPE(NODE_BEVEL)
+SHADER_NODE_TYPE(NODE_AMBIENT_OCCLUSION)
+SHADER_NODE_TYPE(NODE_TEX_VOXEL)
+SHADER_NODE_TYPE(NODE_AOV_START)
+SHADER_NODE_TYPE(NODE_AOV_COLOR)
+SHADER_NODE_TYPE(NODE_AOV_VALUE)
+SHADER_NODE_TYPE(NODE_FLOAT_CURVE)
+
+/* Padding for struct alignment. */
+SHADER_NODE_TYPE(NODE_PAD1)
+
+#undef SHADER_NODE_TYPE
--- a/intern/cycles/kernel/svm/svm.h
+++ b/intern/cycles/kernel/svm/svm.h
@@ -204,6 +204,15 @@ CCL_NAMESPACE_END

 CCL_NAMESPACE_BEGIN

+#ifdef __KERNEL_USE_DATA_CONSTANTS__
+#  define SVM_CASE(node) \
+    case node: \
+      if (!kernel_data_svm_usage_##node) \
+        break;
+#else
+#  define SVM_CASE(node) case node:
+#endif
+
 /* Main Interpreter Loop */
 template<uint node_feature_mask, ShaderType type, typename ConstIntegratorGenericState>
 ccl_device void svm_eval_nodes(KernelGlobals kg,
@@ -219,9 +228,10 @@ ccl_device void svm_eval_nodes(KernelGlobals kg,
    uint4 node = read_node(kg, &offset);

    switch (node.x) {
-      case NODE_END:
-        return;
-      case NODE_SHADER_JUMP: {
+      SVM_CASE(NODE_END)
+      return;
+      SVM_CASE(NODE_SHADER_JUMP)
+      {
        if (type == SHADER_TYPE_SURFACE)
          offset = node.y;
        else if (type == SHADER_TYPE_VOLUME)
@@ -232,351 +242,349 @@ ccl_device void svm_eval_nodes(KernelGlobals kg,
          return;
        break;
      }
-      case NODE_CLOSURE_BSDF:
-        offset = svm_node_closure_bsdf<node_feature_mask, type>(
-            kg, sd, stack, node, path_flag, offset);
-        break;
-      case NODE_CLOSURE_EMISSION:
-        IF_KERNEL_NODES_FEATURE(EMISSION)
-        {
-          svm_node_closure_emission(sd, stack, node);
-        }
-        break;
-      case NODE_CLOSURE_BACKGROUND:
-        IF_KERNEL_NODES_FEATURE(EMISSION)
-        {
-          svm_node_closure_background(sd, stack, node);
-        }
-        break;
-      case NODE_CLOSURE_SET_WEIGHT:
-        svm_node_closure_set_weight(sd, node.y, node.z, node.w);
-        break;
-      case NODE_CLOSURE_WEIGHT:
-        svm_node_closure_weight(sd, stack, node.y);
-        break;
-      case NODE_EMISSION_WEIGHT:
-        IF_KERNEL_NODES_FEATURE(EMISSION)
-        {
-          svm_node_emission_weight(kg, sd, stack, node);
-        }
-        break;
-      case NODE_MIX_CLOSURE:
-        svm_node_mix_closure(sd, stack, node);
-        break;
-      case NODE_JUMP_IF_ZERO:
-        if (stack_load_float(stack, node.z) <= 0.0f)
-          offset += node.y;
-        break;
-      case NODE_JUMP_IF_ONE:
-        if (stack_load_float(stack, node.z) >= 1.0f)
-          offset += node.y;
-        break;
-      case NODE_GEOMETRY:
-        svm_node_geometry(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_CONVERT:
-        svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_TEX_COORD:
-        offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
-        break;
-      case NODE_VALUE_F:
-        svm_node_value_f(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_VALUE_V:
-        offset = svm_node_value_v(kg, sd, stack, node.y, offset);
-        break;
-      case NODE_ATTR:
-        svm_node_attr<node_feature_mask>(kg, sd, stack, node);
-        break;
-      case NODE_VERTEX_COLOR:
-        svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_GEOMETRY_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
-        }
-        break;
-      case NODE_GEOMETRY_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
-        }
-        break;
-      case NODE_SET_DISPLACEMENT:
-        svm_node_set_displacement<node_feature_mask>(kg, sd, stack, node.y);
-        break;
-      case NODE_DISPLACEMENT:
-        svm_node_displacement<node_feature_mask>(kg, sd, stack, node);
-        break;
-      case NODE_VECTOR_DISPLACEMENT:
-        offset = svm_node_vector_displacement<node_feature_mask>(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_IMAGE:
-        offset = svm_node_tex_image(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_IMAGE_BOX:
-        svm_node_tex_image_box(kg, sd, stack, node);
-        break;
-      case NODE_TEX_NOISE:
-        offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_SET_BUMP:
-        svm_node_set_bump<node_feature_mask>(kg, sd, stack, node);
-        break;
-      case NODE_ATTR_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_attr_bump_dx(kg, sd, stack, node);
-        }
-        break;
-      case NODE_ATTR_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_attr_bump_dy(kg, sd, stack, node);
-        }
-        break;
-      case NODE_VERTEX_COLOR_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
-        }
-        break;
-      case NODE_VERTEX_COLOR_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
-        }
-        break;
-      case NODE_TEX_COORD_BUMP_DX:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
-        }
-        break;
-      case NODE_TEX_COORD_BUMP_DY:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
-        }
-        break;
-      case NODE_CLOSURE_SET_NORMAL:
-        IF_KERNEL_NODES_FEATURE(BUMP)
-        {
-          svm_node_set_normal(kg, sd, stack, node.y, node.z);
-        }
-        break;
-      case NODE_ENTER_BUMP_EVAL:
-        IF_KERNEL_NODES_FEATURE(BUMP_STATE)
-        {
-          svm_node_enter_bump_eval(kg, sd, stack, node.y);
-        }
-        break;
-      case NODE_LEAVE_BUMP_EVAL:
-        IF_KERNEL_NODES_FEATURE(BUMP_STATE)
-        {
-          svm_node_leave_bump_eval(kg, sd, stack, node.y);
-        }
-        break;
-      case NODE_HSV:
-        svm_node_hsv(kg, sd, stack, node);
-        break;
-
-      case NODE_CLOSURE_HOLDOUT:
-        svm_node_closure_holdout(sd, stack, node);
-        break;
-      case NODE_FRESNEL:
-        svm_node_fresnel(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_LAYER_WEIGHT:
-        svm_node_layer_weight(sd, stack, node);
-        break;
-      case NODE_CLOSURE_VOLUME:
-        IF_KERNEL_NODES_FEATURE(VOLUME)
-        {
-          svm_node_closure_volume<type>(kg, sd, stack, node);
-        }
-        break;
-      case NODE_PRINCIPLED_VOLUME:
-        IF_KERNEL_NODES_FEATURE(VOLUME)
-        {
-          offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
-        }
-        break;
-      case NODE_MATH:
-        svm_node_math(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_VECTOR_MATH:
-        offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_RGB_RAMP:
-        offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
-        break;
-      case NODE_GAMMA:
-        svm_node_gamma(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_BRIGHTCONTRAST:
-        svm_node_brightness(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_LIGHT_PATH:
-        svm_node_light_path<node_feature_mask>(kg, state, sd, stack, node.y, node.z, path_flag);
-        break;
-      case NODE_OBJECT_INFO:
-        svm_node_object_info(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_PARTICLE_INFO:
-        svm_node_particle_info(kg, sd, stack, node.y, node.z);
-        break;
+      SVM_CASE(NODE_CLOSURE_BSDF)
+      offset = svm_node_closure_bsdf<node_feature_mask, type>(
+          kg, sd, stack, node, path_flag, offset);
+      break;
+      SVM_CASE(NODE_CLOSURE_EMISSION)
+      IF_KERNEL_NODES_FEATURE(EMISSION)
+      {
+        svm_node_closure_emission(sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_CLOSURE_BACKGROUND)
+      IF_KERNEL_NODES_FEATURE(EMISSION)
+      {
+        svm_node_closure_background(sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_CLOSURE_SET_WEIGHT)
+      svm_node_closure_set_weight(sd, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_CLOSURE_WEIGHT)
+      svm_node_closure_weight(sd, stack, node.y);
+      break;
+      SVM_CASE(NODE_EMISSION_WEIGHT)
+      IF_KERNEL_NODES_FEATURE(EMISSION)
+      {
+        svm_node_emission_weight(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_MIX_CLOSURE)
+      svm_node_mix_closure(sd, stack, node);
+      break;
+      SVM_CASE(NODE_JUMP_IF_ZERO)
+      if (stack_load_float(stack, node.z) <= 0.0f)
+        offset += node.y;
+      break;
+      SVM_CASE(NODE_JUMP_IF_ONE)
+      if (stack_load_float(stack, node.z) >= 1.0f)
+        offset += node.y;
+      break;
+      SVM_CASE(NODE_GEOMETRY)
+      svm_node_geometry(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_CONVERT)
+      svm_node_convert(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_TEX_COORD)
+      offset = svm_node_tex_coord(kg, sd, path_flag, stack, node, offset);
+      break;
+      SVM_CASE(NODE_VALUE_F)
+      svm_node_value_f(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_VALUE_V)
+      offset = svm_node_value_v(kg, sd, stack, node.y, offset);
+      break;
+      SVM_CASE(NODE_ATTR)
+      svm_node_attr<node_feature_mask>(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_VERTEX_COLOR)
+      svm_node_vertex_color(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_GEOMETRY_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_geometry_bump_dx(kg, sd, stack, node.y, node.z);
+      }
+      break;
+      SVM_CASE(NODE_GEOMETRY_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_geometry_bump_dy(kg, sd, stack, node.y, node.z);
+      }
+      break;
+      SVM_CASE(NODE_SET_DISPLACEMENT)
+      svm_node_set_displacement<node_feature_mask>(kg, sd, stack, node.y);
+      break;
+      SVM_CASE(NODE_DISPLACEMENT)
+      svm_node_displacement<node_feature_mask>(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_VECTOR_DISPLACEMENT)
+      offset = svm_node_vector_displacement<node_feature_mask>(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_IMAGE)
+      offset = svm_node_tex_image(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_IMAGE_BOX)
+      svm_node_tex_image_box(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_NOISE)
+      offset = svm_node_tex_noise(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_SET_BUMP)
+      svm_node_set_bump<node_feature_mask>(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_ATTR_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_attr_bump_dx(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_ATTR_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_attr_bump_dy(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_VERTEX_COLOR_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_vertex_color_bump_dx(kg, sd, stack, node.y, node.z, node.w);
+      }
+      break;
+      SVM_CASE(NODE_VERTEX_COLOR_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_vertex_color_bump_dy(kg, sd, stack, node.y, node.z, node.w);
+      }
+      break;
+      SVM_CASE(NODE_TEX_COORD_BUMP_DX)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        offset = svm_node_tex_coord_bump_dx(kg, sd, path_flag, stack, node, offset);
+      }
+      break;
+      SVM_CASE(NODE_TEX_COORD_BUMP_DY)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        offset = svm_node_tex_coord_bump_dy(kg, sd, path_flag, stack, node, offset);
+      }
+      break;
+      SVM_CASE(NODE_CLOSURE_SET_NORMAL)
+      IF_KERNEL_NODES_FEATURE(BUMP)
+      {
+        svm_node_set_normal(kg, sd, stack, node.y, node.z);
+      }
+      break;
+      SVM_CASE(NODE_ENTER_BUMP_EVAL)
+      IF_KERNEL_NODES_FEATURE(BUMP_STATE)
+      {
+        svm_node_enter_bump_eval(kg, sd, stack, node.y);
+      }
+      break;
+      SVM_CASE(NODE_LEAVE_BUMP_EVAL)
+      IF_KERNEL_NODES_FEATURE(BUMP_STATE)
+      {
+        svm_node_leave_bump_eval(kg, sd, stack, node.y);
+      }
+      break;
+      SVM_CASE(NODE_HSV)
+      svm_node_hsv(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_CLOSURE_HOLDOUT)
+      svm_node_closure_holdout(sd, stack, node);
+      break;
+      SVM_CASE(NODE_FRESNEL)
+      svm_node_fresnel(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_LAYER_WEIGHT)
+      svm_node_layer_weight(sd, stack, node);
+      break;
+      SVM_CASE(NODE_CLOSURE_VOLUME)
+      IF_KERNEL_NODES_FEATURE(VOLUME)
+      {
+        svm_node_closure_volume<type>(kg, sd, stack, node);
+      }
+      break;
+      SVM_CASE(NODE_PRINCIPLED_VOLUME)
+      IF_KERNEL_NODES_FEATURE(VOLUME)
+      {
+        offset = svm_node_principled_volume<type>(kg, sd, stack, node, path_flag, offset);
+      }
+      break;
+      SVM_CASE(NODE_MATH)
+      svm_node_math(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_VECTOR_MATH)
+      offset = svm_node_vector_math(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_RGB_RAMP)
+      offset = svm_node_rgb_ramp(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_GAMMA)
+      svm_node_gamma(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_BRIGHTCONTRAST)
+      svm_node_brightness(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_LIGHT_PATH)
+      svm_node_light_path<node_feature_mask>(kg, state, sd, stack, node.y, node.z, path_flag);
+      break;
+      SVM_CASE(NODE_OBJECT_INFO)
+      svm_node_object_info(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_PARTICLE_INFO)
+      svm_node_particle_info(kg, sd, stack, node.y, node.z);
+      break;
 #if defined(__HAIR__)
-      case NODE_HAIR_INFO:
-        svm_node_hair_info(kg, sd, stack, node.y, node.z);
-        break;
+      SVM_CASE(NODE_HAIR_INFO)
+      svm_node_hair_info(kg, sd, stack, node.y, node.z);
+      break;
 #endif
 #if defined(__POINTCLOUD__)
-      case NODE_POINT_INFO:
-        svm_node_point_info(kg, sd, stack, node.y, node.z);
-        break;
+      SVM_CASE(NODE_POINT_INFO)
+      svm_node_point_info(kg, sd, stack, node.y, node.z);
+      break;
 #endif
-      case NODE_TEXTURE_MAPPING:
-        offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
-        break;
-      case NODE_MAPPING:
-        svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_MIN_MAX:
-        offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
-        break;
-      case NODE_CAMERA:
-        svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_TEX_ENVIRONMENT:
-        svm_node_tex_environment(kg, sd, stack, node);
-        break;
-      case NODE_TEX_SKY:
-        offset = svm_node_tex_sky(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_GRADIENT:
-        svm_node_tex_gradient(sd, stack, node);
-        break;
-      case NODE_TEX_VORONOI:
-        offset = svm_node_tex_voronoi<node_feature_mask>(
-            kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_TEX_MUSGRAVE:
-        offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_TEX_WAVE:
-        offset = svm_node_tex_wave(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_MAGIC:
-        offset = svm_node_tex_magic(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_CHECKER:
-        svm_node_tex_checker(kg, sd, stack, node);
-        break;
-      case NODE_TEX_BRICK:
-        offset = svm_node_tex_brick(kg, sd, stack, node, offset);
-        break;
-      case NODE_TEX_WHITE_NOISE:
-        svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_NORMAL:
-        offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_LIGHT_FALLOFF:
-        svm_node_light_falloff(sd, stack, node);
-        break;
-      case NODE_IES:
-        svm_node_ies(kg, sd, stack, node);
-        break;
-      case NODE_RGB_CURVES:
-      case NODE_VECTOR_CURVES:
-        offset = svm_node_curves(kg, sd, stack, node, offset);
-        break;
-      case NODE_FLOAT_CURVE:
-        offset = svm_node_curve(kg, sd, stack, node, offset);
-        break;
-      case NODE_TANGENT:
-        svm_node_tangent(kg, sd, stack, node);
-        break;
-      case NODE_NORMAL_MAP:
-        svm_node_normal_map(kg, sd, stack, node);
-        break;
-      case NODE_INVERT:
-        svm_node_invert(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_MIX:
-        offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_SEPARATE_COLOR:
-        svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_COMBINE_COLOR:
-        svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_SEPARATE_VECTOR:
-        svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_COMBINE_VECTOR:
-        svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_SEPARATE_HSV:
-        offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_COMBINE_HSV:
-        offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_VECTOR_ROTATE:
-        svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
-        break;
-      case NODE_VECTOR_TRANSFORM:
-        svm_node_vector_transform(kg, sd, stack, node);
-        break;
-      case NODE_WIREFRAME:
-        svm_node_wireframe(kg, sd, stack, node);
-        break;
-      case NODE_WAVELENGTH:
-        svm_node_wavelength(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_BLACKBODY:
-        svm_node_blackbody(kg, sd, stack, node.y, node.z);
-        break;
-      case NODE_MAP_RANGE:
-        offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_VECTOR_MAP_RANGE:
-        offset = svm_node_vector_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
-      case NODE_CLAMP:
-        offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
-        break;
+      SVM_CASE(NODE_TEXTURE_MAPPING)
+      offset = svm_node_texture_mapping(kg, sd, stack, node.y, node.z, offset);
+      break;
+      SVM_CASE(NODE_MAPPING)
+      svm_node_mapping(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_MIN_MAX)
+      offset = svm_node_min_max(kg, sd, stack, node.y, node.z, offset);
+      break;
+      SVM_CASE(NODE_CAMERA)
+      svm_node_camera(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_TEX_ENVIRONMENT)
+      svm_node_tex_environment(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_SKY)
+      offset = svm_node_tex_sky(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_GRADIENT)
+      svm_node_tex_gradient(sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_VORONOI)
+      offset = svm_node_tex_voronoi<node_feature_mask>(
+          kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_TEX_MUSGRAVE)
+      offset = svm_node_tex_musgrave(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_TEX_WAVE)
+      offset = svm_node_tex_wave(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_MAGIC)
+      offset = svm_node_tex_magic(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_CHECKER)
+      svm_node_tex_checker(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_TEX_BRICK)
+      offset = svm_node_tex_brick(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TEX_WHITE_NOISE)
+      svm_node_tex_white_noise(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_NORMAL)
+      offset = svm_node_normal(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_LIGHT_FALLOFF)
+      svm_node_light_falloff(sd, stack, node);
+      break;
+      SVM_CASE(NODE_IES)
+      svm_node_ies(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_CURVES)
+      offset = svm_node_curves(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_FLOAT_CURVE)
+      offset = svm_node_curve(kg, sd, stack, node, offset);
+      break;
+      SVM_CASE(NODE_TANGENT)
+      svm_node_tangent(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_NORMAL_MAP)
+      svm_node_normal_map(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_INVERT)
+      svm_node_invert(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_MIX)
+      offset = svm_node_mix(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_SEPARATE_COLOR)
+      svm_node_separate_color(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_COMBINE_COLOR)
+      svm_node_combine_color(kg, sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_SEPARATE_VECTOR)
+      svm_node_separate_vector(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_COMBINE_VECTOR)
+      svm_node_combine_vector(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_SEPARATE_HSV)
+      offset = svm_node_separate_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_COMBINE_HSV)
+      offset = svm_node_combine_hsv(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_VECTOR_ROTATE)
+      svm_node_vector_rotate(sd, stack, node.y, node.z, node.w);
+      break;
+      SVM_CASE(NODE_VECTOR_TRANSFORM)
+      svm_node_vector_transform(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_WIREFRAME)
+      svm_node_wireframe(kg, sd, stack, node);
+      break;
+      SVM_CASE(NODE_WAVELENGTH)
+      svm_node_wavelength(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_BLACKBODY)
+      svm_node_blackbody(kg, sd, stack, node.y, node.z);
+      break;
+      SVM_CASE(NODE_MAP_RANGE)
+      offset = svm_node_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_VECTOR_MAP_RANGE)
+      offset = svm_node_vector_map_range(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
+      SVM_CASE(NODE_CLAMP)
+      offset = svm_node_clamp(kg, sd, stack, node.y, node.z, node.w, offset);
+      break;
 #ifdef __SHADER_RAYTRACE__
-      case NODE_BEVEL:
-        svm_node_bevel<node_feature_mask>(kg, state, sd, stack, node);
-        break;
-      case NODE_AMBIENT_OCCLUSION:
-        svm_node_ao<node_feature_mask>(kg, state, sd, stack, node);
-        break;
+      SVM_CASE(NODE_BEVEL)
+      svm_node_bevel<node_feature_mask>(kg, state, sd, stack, node);
+      break;
+      SVM_CASE(NODE_AMBIENT_OCCLUSION)
+      svm_node_ao<node_feature_mask>(kg, state, sd, stack, node);
+      break;
 #endif

-      case NODE_TEX_VOXEL:
-        IF_KERNEL_NODES_FEATURE(VOLUME)
-        {
-          offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
-        }
-        break;
-      case NODE_AOV_START:
-        if (!svm_node_aov_check(path_flag, render_buffer)) {
-          return;
-        }
-        break;
-      case NODE_AOV_COLOR:
-        svm_node_aov_color<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
-        break;
-      case NODE_AOV_VALUE:
-        svm_node_aov_value<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
-        break;
+      SVM_CASE(NODE_TEX_VOXEL)
+      IF_KERNEL_NODES_FEATURE(VOLUME)
+      {
+        offset = svm_node_tex_voxel(kg, sd, stack, node, offset);
+      }
+      break;
+      SVM_CASE(NODE_AOV_START)
+      if (!svm_node_aov_check(path_flag, render_buffer)) {
+        return;
+      }
+      break;
+      SVM_CASE(NODE_AOV_COLOR)
+      svm_node_aov_color<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
+      break;
+      SVM_CASE(NODE_AOV_VALUE)
+      svm_node_aov_value<node_feature_mask>(kg, state, sd, stack, node, render_buffer);
+      break;
      default:
        kernel_assert(!"Unknown node type was passed to the SVM machine");
        return;
--- a/intern/cycles/kernel/svm/tex_coord.h
+++ b/intern/cycles/kernel/svm/tex_coord.h
@@ -138,7 +138,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dx(KernelGlobals kg,
    case NODE_TEXCO_WINDOW: {
      if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
          kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-        data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(sd->ray_dP, 0.0f, 0.0f));
+        data = camera_world_to_ndc(kg, sd, sd->ray_P);
      else
        data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dx);
      data.z = 0.0f;
@@ -223,7 +223,7 @@ ccl_device_noinline int svm_node_tex_coord_bump_dy(KernelGlobals kg,
    case NODE_TEXCO_WINDOW: {
      if ((path_flag & PATH_RAY_CAMERA) && sd->object == OBJECT_NONE &&
          kernel_data.cam.type == CAMERA_ORTHOGRAPHIC)
-        data = camera_world_to_ndc(kg, sd, sd->ray_P + make_float3(0.0f, sd->ray_dP, 0.0f));
+        data = camera_world_to_ndc(kg, sd, sd->ray_P);
      else
        data = camera_world_to_ndc(kg, sd, sd->P + sd->dP.dy);
      data.z = 0.0f;
--- a/intern/cycles/kernel/svm/types.h
+++ b/intern/cycles/kernel/svm/types.h
@@ -17,104 +17,9 @@ CCL_NAMESPACE_BEGIN
 /* Nodes */

 typedef enum ShaderNodeType {
-  NODE_END = 0,
-  NODE_SHADER_JUMP,
-  NODE_CLOSURE_BSDF,
-  NODE_CLOSURE_EMISSION,
-  NODE_CLOSURE_BACKGROUND,
-  NODE_CLOSURE_SET_WEIGHT,
-  NODE_CLOSURE_WEIGHT,
-  NODE_EMISSION_WEIGHT,
-  NODE_MIX_CLOSURE,
-  NODE_JUMP_IF_ZERO,
-  NODE_JUMP_IF_ONE,
-  NODE_GEOMETRY,
-  NODE_CONVERT,
-  NODE_TEX_COORD,
-  NODE_VALUE_F,
-  NODE_VALUE_V,
-  NODE_ATTR,
-  NODE_VERTEX_COLOR,
-  NODE_GEOMETRY_BUMP_DX,
-  NODE_GEOMETRY_BUMP_DY,
-  NODE_SET_DISPLACEMENT,
-  NODE_DISPLACEMENT,
-  NODE_VECTOR_DISPLACEMENT,
-  NODE_TEX_IMAGE,
-  NODE_TEX_IMAGE_BOX,
-  NODE_TEX_NOISE,
-  NODE_SET_BUMP,
-  NODE_ATTR_BUMP_DX,
-  NODE_ATTR_BUMP_DY,
-  NODE_VERTEX_COLOR_BUMP_DX,
-  NODE_VERTEX_COLOR_BUMP_DY,
-  NODE_TEX_COORD_BUMP_DX,
-  NODE_TEX_COORD_BUMP_DY,
-  NODE_CLOSURE_SET_NORMAL,
-  NODE_ENTER_BUMP_EVAL,
-  NODE_LEAVE_BUMP_EVAL,
-  NODE_HSV,
-  NODE_CLOSURE_HOLDOUT,
-  NODE_FRESNEL,
-  NODE_LAYER_WEIGHT,
-  NODE_CLOSURE_VOLUME,
-  NODE_PRINCIPLED_VOLUME,
-  NODE_MATH,
-  NODE_VECTOR_MATH,
-  NODE_RGB_RAMP,
-  NODE_GAMMA,
-  NODE_BRIGHTCONTRAST,
-  NODE_LIGHT_PATH,
-  NODE_OBJECT_INFO,
-  NODE_PARTICLE_INFO,
-  NODE_HAIR_INFO,
-  NODE_POINT_INFO,
-  NODE_TEXTURE_MAPPING,
-  NODE_MAPPING,
-  NODE_MIN_MAX,
-  NODE_CAMERA,
-  NODE_TEX_ENVIRONMENT,
-  NODE_TEX_SKY,
-  NODE_TEX_GRADIENT,
-  NODE_TEX_VORONOI,
-  NODE_TEX_MUSGRAVE,
-  NODE_TEX_WAVE,
-  NODE_TEX_MAGIC,
-  NODE_TEX_CHECKER,
-  NODE_TEX_BRICK,
-  NODE_TEX_WHITE_NOISE,
-  NODE_NORMAL,
-  NODE_LIGHT_FALLOFF,
-  NODE_IES,
-  NODE_RGB_CURVES,
-  NODE_VECTOR_CURVES,
-  NODE_TANGENT,
-  NODE_NORMAL_MAP,
-  NODE_INVERT,
-  NODE_MIX,
-  NODE_SEPARATE_COLOR,
-  NODE_COMBINE_COLOR,
-  NODE_SEPARATE_VECTOR,
-  NODE_COMBINE_VECTOR,
-  NODE_SEPARATE_HSV,
-  NODE_COMBINE_HSV,
-  NODE_VECTOR_ROTATE,
-  NODE_VECTOR_TRANSFORM,
-  NODE_WIREFRAME,
-  NODE_WAVELENGTH,
-  NODE_BLACKBODY,
-  NODE_MAP_RANGE,
-  NODE_VECTOR_MAP_RANGE,
-  NODE_CLAMP,
-  NODE_BEVEL,
-  NODE_AMBIENT_OCCLUSION,
-  NODE_TEX_VOXEL,
-  NODE_AOV_START,
-  NODE_AOV_COLOR,
-  NODE_AOV_VALUE,
-  NODE_FLOAT_CURVE,
-  /* NOTE: for best OpenCL performance, item definition in the enum must
-   * match the switch case order in `svm.h`. */
+#define SHADER_NODE_TYPE(name) name,
+#include "node_types_template.h"
+  NODE_NUM
 } ShaderNodeType;

 typedef enum NodeAttributeOutputType {
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -535,7 +535,8 @@ typedef struct RaySelfPrimitives {
 typedef struct Ray {
  float3 P;   /* origin */
  float3 D;   /* direction */
-  float t;    /* length of the ray */
+  float tmin; /* start distance */
+  float tmax; /* end distance */
  float time; /* time (for motion blur) */

  RaySelfPrimitives self;
@@ -1072,94 +1073,6 @@ typedef struct KernelCamera {
 } KernelCamera;
 static_assert_align(KernelCamera, 16);

-typedef struct KernelFilm {
-  float exposure;
-  int pass_flag;
-
-  int light_pass_flag;
-  int pass_stride;
-
-  int pass_combined;
-  int pass_depth;
-  int pass_position;
-  int pass_normal;
-  int pass_roughness;
-  int pass_motion;
-
-  int pass_motion_weight;
-  int pass_uv;
-  int pass_object_id;
-  int pass_material_id;
-
-  int pass_diffuse_color;
-  int pass_glossy_color;
-  int pass_transmission_color;
-
-  int pass_diffuse_indirect;
-  int pass_glossy_indirect;
-  int pass_transmission_indirect;
-  int pass_volume_indirect;
-
-  int pass_diffuse_direct;
-  int pass_glossy_direct;
-  int pass_transmission_direct;
-  int pass_volume_direct;
-
-  int pass_emission;
-  int pass_background;
-  int pass_ao;
-  float pass_alpha_threshold;
-
-  int pass_shadow;
-  float pass_shadow_scale;
-
-  int pass_shadow_catcher;
-  int pass_shadow_catcher_sample_count;
-  int pass_shadow_catcher_matte;
-
-  int filter_table_offset;
-
-  int cryptomatte_passes;
-  int cryptomatte_depth;
-  int pass_cryptomatte;
-
-  int pass_adaptive_aux_buffer;
-  int pass_sample_count;
-
-  int pass_mist;
-  float mist_start;
-  float mist_inv_depth;
-  float mist_falloff;
-
-  int pass_denoising_normal;
-  int pass_denoising_albedo;
-  int pass_denoising_depth;
-
-  int pass_aov_color;
-  int pass_aov_value;
-  int pass_lightgroup;
-
-  /* XYZ to rendering color space transform. float4 instead of float3 to
-   * ensure consistent padding/alignment across devices. */
-  float4 xyz_to_r;
-  float4 xyz_to_g;
-  float4 xyz_to_b;
-  float4 rgb_to_y;
-  /* Rec709 to rendering color space. */
-  float4 rec709_to_r;
-  float4 rec709_to_g;
-  float4 rec709_to_b;
-  int is_rec709;
-
-  int pass_bake_primitive;
-  int pass_bake_differential;
-
-  int use_approximate_shadow_catcher;
-
-  int pad1;
-} KernelFilm;
-static_assert_align(KernelFilm, 16);
-
 typedef struct KernelFilmConvert {
  int pass_offset;
  int pass_stride;
@@ -1201,108 +1114,6 @@ typedef struct KernelFilmConvert {
 } KernelFilmConvert;
 static_assert_align(KernelFilmConvert, 16);

-typedef struct KernelBackground {
-  /* only shader index */
-  int surface_shader;
-  int volume_shader;
-  float volume_step_size;
-  int transparent;
-  float transparent_roughness_squared_threshold;
-
-  /* portal sampling */
-  float portal_weight;
-  int num_portals;
-  int portal_offset;
-
-  /* sun sampling */
-  float sun_weight;
-  /* xyz store direction, w the angle. float4 instead of float3 is used
-   * to ensure consistent padding/alignment across devices. */
-  float4 sun;
-
-  /* map sampling */
-  float map_weight;
-  int map_res_x;
-  int map_res_y;
-
-  int use_mis;
-
-  int lightgroup;
-
-  /* Padding */
-  int pad1, pad2;
-} KernelBackground;
-static_assert_align(KernelBackground, 16);
-
-typedef struct KernelIntegrator {
-  /* emission */
-  int use_direct_light;
-  int num_distribution;
-  int num_all_lights;
-  float pdf_triangles;
-  float pdf_lights;
-  float light_inv_rr_threshold;
-
-  /* bounces */
-  int min_bounce;
-  int max_bounce;
-
-  int max_diffuse_bounce;
-  int max_glossy_bounce;
-  int max_transmission_bounce;
-  int max_volume_bounce;
-
-  /* AO bounces */
-  int ao_bounces;
-  float ao_bounces_distance;
-  float ao_bounces_factor;
-  float ao_additive_factor;
-
-  /* transparent */
-  int transparent_min_bounce;
-  int transparent_max_bounce;
-  int transparent_shadows;
-
-  /* caustics */
-  int caustics_reflective;
-  int caustics_refractive;
-  float filter_glossy;
-
-  /* seed */
-  int seed;
-
-  /* clamp */
-  float sample_clamp_direct;
-  float sample_clamp_indirect;
-
-  /* mis */
-  int use_lamp_mis;
-
-  /* caustics */
-  int use_caustics;
-
-  /* sampler */
-  int sampling_pattern;
-
-  /* volume render */
-  int use_volumes;
-  int volume_max_steps;
-  float volume_step_rate;
-
-  int has_shadow_catcher;
-  float scrambling_distance;
-
-  /* Closure filter. */
-  int filter_closures;
-
-  /* MIS debugging. */
-  int direct_light_sampling_type;
-
-  /* padding */
-  int pad1;
-} KernelIntegrator;
-static_assert_align(KernelIntegrator, 16);
-
 typedef enum KernelBVHLayout {
  BVH_LAYOUT_NONE = 0,

@@ -1320,36 +1131,25 @@ typedef enum KernelBVHLayout {
  BVH_LAYOUT_ALL = BVH_LAYOUT_BVH2 | BVH_LAYOUT_EMBREE | BVH_LAYOUT_OPTIX | BVH_LAYOUT_METAL,
 } KernelBVHLayout;

-typedef struct KernelBVH {
-  /* Own BVH */
-  int root;
-  int have_motion;
-  int have_curves;
-  int bvh_layout;
-  int use_bvh_steps;
-  int curve_subdivisions;
+/* Specialized struct that can become constants in dynamic compilation. */
+#define KERNEL_STRUCT_BEGIN(name, parent) struct name {
+#define KERNEL_STRUCT_END(name) \
+  } \
+  ; \
+  static_assert_align(name, 16);

-  /* Custom BVH */
-#ifdef __KERNEL_OPTIX__
-  OptixTraversableHandle scene;
-#elif defined __METALRT__
-  metalrt_as_type scene;
+#ifdef __KERNEL_USE_DATA_CONSTANTS__
+#  define KERNEL_STRUCT_MEMBER(parent, type, name) type __unused_##name;
 #else
-#  ifdef __EMBREE__
-  RTCScene scene;
-#    ifndef __KERNEL_64_BIT__
-  int pad2;
-#    endif
-#  else
-  int scene, pad2;
-#  endif
+#  define KERNEL_STRUCT_MEMBER(parent, type, name) type name;
 #endif
-} KernelBVH;
-static_assert_align(KernelBVH, 16);
+
+#include "kernel/data_template.h"

 typedef struct KernelTables {
  int beckmann_offset;
-  int pad1, pad2, pad3;
+  int filter_table_offset;
+  int pad1, pad2;
 } KernelTables;
 static_assert_align(KernelTables, 16);

@@ -1362,18 +1162,37 @@ typedef struct KernelBake {
 static_assert_align(KernelBake, 16);

 typedef struct KernelData {
+  /* Features and limits. */
  uint kernel_features;
  uint max_closures;
  uint max_shaders;
  uint volume_stack_size;

+  /* Always dynamic data mambers. */
  KernelCamera cam;
-  KernelFilm film;
-  KernelBackground background;
-  KernelIntegrator integrator;
-  KernelBVH bvh;
-  KernelTables tables;
  KernelBake bake;
+  KernelTables tables;
+
+  /* Potentially specialized data members. */
+#define KERNEL_STRUCT_BEGIN(name, parent) name parent;
+#include "kernel/data_template.h"
+
+  /* Device specific BVH. */
+#ifdef __KERNEL_OPTIX__
+  OptixTraversableHandle device_bvh;
+#elif defined __METALRT__
+  metalrt_as_type device_bvh;
+#else
+#  ifdef __EMBREE__
+  RTCScene device_bvh;
+#    ifndef __KERNEL_64_BIT__
+  int pad1;
+#    endif
+#  else
+  int device_bvh, pad1;
+#  endif
+#endif
+  int pad2, pad3;
 } KernelData;
 static_assert_align(KernelData, 16);

--- a/intern/cycles/scene/film.cpp
+++ b/intern/cycles/scene/film.cpp
@@ -394,7 +394,7 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
  vector<float> table = filter_table(filter_type, filter_width);
  scene->lookup_tables->remove_table(&filter_table_offset_);
  filter_table_offset_ = scene->lookup_tables->add_table(dscene, table);
-  kfilm->filter_table_offset = (int)filter_table_offset_;
+  dscene->data.tables.filter_table_offset = (int)filter_table_offset_;

  /* mist pass parameters */
  kfilm->mist_start = mist_start;
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@@ -1362,7 +1362,7 @@ void GeometryManager::device_update_bvh(Device *device,
  dscene->data.bvh.use_bvh_steps = (scene->params.num_bvh_time_steps != 0);
  dscene->data.bvh.curve_subdivisions = scene->params.curve_subdivisions();
  /* The scene handle is set in 'CPUDevice::const_copy_to' and 'OptiXDevice::const_copy_to' */
-  dscene->data.bvh.scene = 0;
+  dscene->data.device_bvh = 0;
 }

 /* Set of flags used to help determining what data has been modified or needs reallocation, so we
--- a/intern/cycles/scene/scene.cpp
+++ b/intern/cycles/scene/scene.cpp
@@ -369,6 +369,8 @@ void Scene::device_update(Device *device_, Progress &progress)
    device->const_copy_to("data", &dscene.data, sizeof(dscene.data));
  }

+  device->optimize_for_scene(this);
+
  if (print_stats) {
    size_t mem_used = util_guarded_get_mem_used();
    size_t mem_peak = util_guarded_get_mem_peak();
--- a/intern/cycles/scene/scene.h
+++ b/intern/cycles/scene/scene.h
@@ -82,7 +82,7 @@ class DeviceScene {

  device_vector<uint> patches;

-  /* pointcloud */
+  /* point-cloud */
  device_vector<float4> points;
  device_vector<uint> points_shader;

@@ -124,7 +124,7 @@ class DeviceScene {
  /* integrator */
  device_vector<float> sample_pattern_lut;

-  /* ies lights */
+  /* IES lights */
  device_vector<float> ies_lights;

  KernelData data;
--- a/intern/cycles/scene/shader_nodes.cpp
+++ b/intern/cycles/scene/shader_nodes.cpp
@@ -6671,7 +6671,7 @@ void CurvesNode::compile(SVMCompiler &compiler,

  ShaderInput *fac_in = input("Fac");

-  compiler.add_node(type,
+  compiler.add_node(ShaderNodeType(type),
                    compiler.encode_uchar4(compiler.stack_assign(fac_in),
                                           compiler.stack_assign(value_in),
                                           compiler.stack_assign(value_out),
@@ -6736,7 +6736,7 @@ void RGBCurvesNode::constant_fold(const ConstantFolder &folder)

 void RGBCurvesNode::compile(SVMCompiler &compiler)
 {
-  CurvesNode::compile(compiler, NODE_RGB_CURVES, input("Color"), output("Color"));
+  CurvesNode::compile(compiler, NODE_CURVES, input("Color"), output("Color"));
 }

 void RGBCurvesNode::compile(OSLCompiler &compiler)
@@ -6774,7 +6774,7 @@ void VectorCurvesNode::constant_fold(const ConstantFolder &folder)

 void VectorCurvesNode::compile(SVMCompiler &compiler)
 {
-  CurvesNode::compile(compiler, NODE_VECTOR_CURVES, input("Vector"), output("Vector"));
+  CurvesNode::compile(compiler, NODE_CURVES, input("Vector"), output("Vector"));
 }

 void VectorCurvesNode::compile(OSLCompiler &compiler)
--- a/intern/cycles/scene/svm.cpp
+++ b/intern/cycles/scene/svm.cpp
@@ -44,8 +44,6 @@ void SVMShaderManager::device_update_shader(Scene *scene,
  }
  assert(shader->graph);

-  svm_nodes->push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
-
  SVMCompiler::Summary summary;
  SVMCompiler compiler(scene);
  compiler.background = (shader == scene->background->get_shader(scene));
@@ -170,6 +168,9 @@ SVMCompiler::SVMCompiler(Scene *scene) : scene(scene)
  background = false;
  mix_weight_offset = SVM_STACK_INVALID;
  compile_failed = false;
+
+  /* This struct has one entry for every node, in order of ShaderNodeType definition. */
+  svm_node_types_used = (std::atomic_int *)&scene->dscene.data.svm_usage;
 }

 int SVMCompiler::stack_size(SocketType::Type type)
@@ -378,11 +379,13 @@ void SVMCompiler::add_node(int a, int b, int c, int d)

 void SVMCompiler::add_node(ShaderNodeType type, int a, int b, int c)
 {
+  svm_node_types_used[type] = true;
  current_svm_nodes.push_back_slow(make_int4(type, a, b, c));
 }

 void SVMCompiler::add_node(ShaderNodeType type, const float3 &f)
 {
+  svm_node_types_used[type] = true;
  current_svm_nodes.push_back_slow(
      make_int4(type, __float_as_int(f.x), __float_as_int(f.y), __float_as_int(f.z)));
 }
@@ -663,6 +666,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
        /* Add instruction to skip closure and its dependencies if mix
         * weight is zero.
         */
+        svm_node_types_used[NODE_JUMP_IF_ONE] = true;
        current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ONE, 0, stack_assign(facin), 0));
        int node_jump_skip_index = current_svm_nodes.size() - 1;

@@ -678,6 +682,7 @@ void SVMCompiler::generate_multi_closure(ShaderNode *root_node,
        /* Add instruction to skip closure and its dependencies if mix
         * weight is zero.
         */
+        svm_node_types_used[NODE_JUMP_IF_ZERO] = true;
        current_svm_nodes.push_back_slow(make_int4(NODE_JUMP_IF_ZERO, 0, stack_assign(facin), 0));
        int node_jump_skip_index = current_svm_nodes.size() - 1;

@@ -844,6 +849,9 @@ void SVMCompiler::compile_type(Shader *shader, ShaderGraph *graph, ShaderType ty

 void SVMCompiler::compile(Shader *shader, array<int4> &svm_nodes, int index, Summary *summary)
 {
+  svm_node_types_used[NODE_SHADER_JUMP] = true;
+  svm_nodes.push_back_slow(make_int4(NODE_SHADER_JUMP, 0, 0, 0));
+
  /* copy graph for shader with bump mapping */
  ShaderNode *output = shader->graph->output();
  int start_num_svm_nodes = svm_nodes.size();
--- a/intern/cycles/scene/svm.h
+++ b/intern/cycles/scene/svm.h
@@ -211,6 +211,7 @@ class SVMCompiler {
  /* compile */
  void compile_type(Shader *shader, ShaderGraph *graph, ShaderType type);

+  std::atomic_int *svm_node_types_used;
  array<int4> current_svm_nodes;
  ShaderType current_type;
  Shader *current_shader;
--- a/intern/cycles/util/math_intersect.h
+++ b/intern/cycles/util/math_intersect.h
@@ -10,7 +10,8 @@ CCL_NAMESPACE_BEGIN

 ccl_device bool ray_sphere_intersect(float3 ray_P,
                                     float3 ray_D,
-                                     float ray_t,
+                                     float ray_tmin,
+                                     float ray_tmax,
                                     float3 sphere_P,
                                     float sphere_radius,
                                     ccl_private float3 *isect_P,
@@ -33,7 +34,7 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,
      return false;
    }
    const float t = tp - sqrtf(radiussq - dsq); /* pythagoras */
-    if (t < ray_t) {
+    if (t > ray_tmin && t < ray_tmax) {
      *isect_t = t;
      *isect_P = ray_P + ray_D * t;
      return true;
@@ -44,7 +45,8 @@ ccl_device bool ray_sphere_intersect(float3 ray_P,

 ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
                                           float3 ray_D,
-                                           float ray_t,
+                                           float ray_tmin,
+                                           float ray_tmax,
                                           float3 disk_P,
                                           float disk_radius,
                                           ccl_private float3 *isect_P,
@@ -59,7 +61,7 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,
  }
  /* Compute t to intersection point. */
  const float t = -disk_t / div;
-  if (t < 0.0f || t > ray_t) {
+  if (!(t > ray_tmin && t < ray_tmax)) {
    return false;
  }
  /* Test if within radius. */
@@ -74,7 +76,8 @@ ccl_device bool ray_aligned_disk_intersect(float3 ray_P,

 ccl_device bool ray_disk_intersect(float3 ray_P,
                                   float3 ray_D,
-                                   float ray_t,
+                                   float ray_tmin,
+                                   float ray_tmax,
                                   float3 disk_P,
                                   float3 disk_N,
                                   float disk_radius,
@@ -92,7 +95,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P,
    }
    float3 P = ray_P + t * ray_D;
    float3 T = P - disk_P;
-    if (dot(T, T) < sqr(disk_radius) /*&& t > 0.f*/ && t <= ray_t) {
+
+    if (dot(T, T) < sqr(disk_radius) && (t > ray_tmin && t < ray_tmax)) {
      *isect_P = ray_P + t * ray_D;
      *isect_t = t;
      return true;
@@ -103,7 +107,8 @@ ccl_device bool ray_disk_intersect(float3 ray_P,

 ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
                                                   float3 ray_dir,
-                                                   float ray_t,
+                                                   float ray_tmin,
+                                                   float ray_tmax,
                                                   const float3 tri_a,
                                                   const float3 tri_b,
                                                   const float3 tri_c,
@@ -149,16 +154,14 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,

  /* Perform depth test. */
  const float T = dot3(v0, Ng);
-  const int sign_den = (__float_as_int(den) & 0x80000000);
-  const float sign_T = xor_signmask(T, sign_den);
-  if ((sign_T < 0.0f) || (sign_T > ray_t * xor_signmask(den, sign_den))) {
+  const float t = T / den;
+  if (!(t >= ray_tmin && t <= ray_tmax)) {
    return false;
  }

-  const float inv_den = 1.0f / den;
-  *isect_u = U * inv_den;
-  *isect_v = V * inv_den;
-  *isect_t = T * inv_den;
+  *isect_u = U / den;
+  *isect_v = V / den;
+  *isect_t = t;
  return true;

 #undef dot3
@@ -171,8 +174,8 @@ ccl_device_forceinline bool ray_triangle_intersect(float3 ray_P,
 */
 ccl_device bool ray_quad_intersect(float3 ray_P,
                                   float3 ray_D,
-                                   float ray_mint,
-                                   float ray_maxt,
+                                   float ray_tmin,
+                                   float ray_tmax,
                                   float3 quad_P,
                                   float3 quad_u,
                                   float3 quad_v,
@@ -185,7 +188,7 @@ ccl_device bool ray_quad_intersect(float3 ray_P,
 {
  /* Perform intersection test. */
  float t = -(dot(ray_P, quad_n) - dot(quad_P, quad_n)) / dot(ray_D, quad_n);
-  if (t < ray_mint || t > ray_maxt) {
+  if (!(t > ray_tmin && t < ray_tmax)) {
    return false;
  }
  const float3 hit = ray_P + t * ray_D;
--- a/intern/cycles/util/string.cpp
+++ b/intern/cycles/util/string.cpp
@@ -136,6 +136,19 @@ void string_replace(string &haystack, const string &needle, const string &other)
  }
 }

+void string_replace_same_length(string &haystack, const string &needle, const string &other)
+{
+  assert(needle.size() == other.size());
+  size_t pos = 0;
+  while (pos != string::npos) {
+    pos = haystack.find(needle, pos);
+    if (pos != string::npos) {
+      memcpy(haystack.data() + pos, other.data(), other.size());
+      pos += other.size();
+    }
+  }
+}
+
 string string_remove_trademark(const string &s)
 {
  string result = s;
@@ -164,6 +177,11 @@ string to_string(const char *str)
  return string(str);
 }

+string to_string(const float4 &v)
+{
+  return string_printf("%f,%f,%f,%f", v.x, v.y, v.z, v.w);
+}
+
 string string_to_lower(const string &s)
 {
  string r = s;
--- a/intern/cycles/util/string.h
+++ b/intern/cycles/util/string.h
@@ -38,12 +38,14 @@ void string_split(vector<string> &tokens,
                  const string &separators = "\t ",
                  bool skip_empty_tokens = true);
 void string_replace(string &haystack, const string &needle, const string &other);
+void string_replace_same_length(string &haystack, const string &needle, const string &other);
 bool string_startswith(string_view s, string_view start);
 bool string_endswith(string_view s, string_view end);
 string string_strip(const string &s);
 string string_remove_trademark(const string &s);
 string string_from_bool(const bool var);
 string to_string(const char *str);
+string to_string(const float4 &v);
 string string_to_lower(const string &s);

 /* Wide char strings are only used on Windows to deal with non-ASCII
--- a/intern/ghost/CMakeLists.txt
+++ b/intern/ghost/CMakeLists.txt
@@ -3,6 +3,7 @@

 set(INC
  .
+  ../clog
  ../glew-mx
  ../../source/blender/imbuf
  ../../source/blender/makesdna
@@ -65,6 +66,8 @@ set(SRC
  intern/GHOST_Util.h
  intern/GHOST_Window.h
  intern/GHOST_WindowManager.h
+  intern/GHOST_utildefines.h
+  intern/GHOST_utildefines_variadic.h
 )

 set(LIB
--- a/intern/ghost/GHOST_Types.h
+++ b/intern/ghost/GHOST_Types.h
@@ -547,21 +547,15 @@ typedef struct {
  /** The key code. */
  GHOST_TKey key;

-  /* ascii / utf8: both should always be set when possible,
-   * - ascii may be '\0' however if the user presses a non ascii key
-   * - unicode may not be set if the system has no unicode support
-   *
-   * These values are intended to be used as follows.
-   * For text input use unicode when available, fallback to ascii.
-   * For areas where unicode is not needed, number input for example, always
-   * use ascii, unicode is ignored - campbell.
-   */
-  /** The ascii code for the key event ('\0' if none). */
-  char ascii;
  /** The unicode character. if the length is 6, not NULL terminated if all 6 are set. */
  char utf8_buf[6];

-  /** Generated by auto-repeat. */
+  /**
+   * Enabled when the key is held (auto-repeat).
+   * In this case press events are sent without a corresponding release/up event.
+   *
+   * All back-ends must set this variable for correct behavior regarding repeatable keys.
+   */
  char is_repeat;
 } GHOST_TEventKeyData;

--- a/Show More
+++ b/Show More