Improve formatting

Fix compilation & dependency depth
Include the shader sources from draw in the dependency library.
2021-11-24 21:41:25 +01:00 · 2021-11-24 20:09:36 +01:00 · 2021-11-24 19:46:00 +01:00 · 2021-11-24 18:56:51 +01:00 · 2021-11-24 17:58:06 +01:00 · 2021-11-24 17:52:39 +01:00
762 changed files with 19511 additions and 12300 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -269,5 +269,7 @@ StatementMacros:
  - PyObject_HEAD
  - PyObject_VAR_HEAD
+  - GPU_STAGE_INTERFACE_CREATE
+  - GPU_SHADER_DESCRIPTOR

 MacroBlockBegin: "^BSDF_CLOSURE_CLASS_BEGIN$"
 MacroBlockEnd: "^BSDF_CLOSURE_CLASS_END$"
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -12,6 +12,8 @@ Checks:  >
  -readability-avoid-const-params-in-decls,
  -readability-simplify-boolean-expr,
  -readability-make-member-function-const,
+  -readability-suspicious-call-argument,
+  -readability-redundant-member-init,

  -readability-misleading-indentation,

@@ -25,6 +27,8 @@ Checks:  >
  -bugprone-branch-clone,
  -bugprone-macro-parentheses,
  -bugprone-reserved-identifier,
+  -bugprone-easily-swappable-parameters,
+  -bugprone-implicit-widening-of-multiplication-result,

  -bugprone-sizeof-expression,
  -bugprone-integer-division,
@@ -40,7 +44,8 @@ Checks:  >
  -modernize-pass-by-value,
  # Cannot be enabled yet, because using raw string literals in tests breaks
  # the windows compiler currently.
-  -modernize-raw-string-literal
+  -modernize-raw-string-literal,
+  -modernize-return-braced-init-list

 CheckOptions:
  - key: modernize-use-default-member-init.UseAssignment
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -411,6 +411,7 @@ option(WITH_CYCLES                   "Enable Cycles Render Engine" ON)
 option(WITH_CYCLES_OSL               "Build Cycles with OpenShadingLanguage support" ON)
 option(WITH_CYCLES_EMBREE            "Build Cycles with Embree support" ON)
 option(WITH_CYCLES_LOGGING           "Build Cycles with logging support" ON)
+option(WITH_CYCLES_DEBUG             "Build Cycles with options useful for debugging (e.g., MIS)" OFF)

 option(WITH_CYCLES_STANDALONE        "Build Cycles standalone application" OFF)
 option(WITH_CYCLES_STANDALONE_GUI    "Build Cycles standalone with GUI" OFF)
@@ -1069,7 +1070,7 @@ if(MSVC)
  add_definitions(-D__LITTLE_ENDIAN__)

  # OSX-Note: as we do cross-compiling with specific set architecture,
-  # endianess-detection and auto-setting is counterproductive
+  # endianness-detection and auto-setting is counterproductive
  # so we just set endianness according CMAKE_OSX_ARCHITECTURES

 elseif(CMAKE_OSX_ARCHITECTURES MATCHES i386 OR CMAKE_OSX_ARCHITECTURES MATCHES x86_64 OR CMAKE_OSX_ARCHITECTURES MATCHES arm64)
@@ -1759,7 +1760,7 @@ endif()
 set(CMAKE_CXX_STANDARD 17)
 # If C++17 is not available, downgrading to an earlier standard is NOT OK.
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-# Do not enable compiler specific language extentions.
+# Do not enable compiler specific language extensions.
 set(CMAKE_CXX_EXTENSIONS OFF)

 # Make MSVC properly report the value of the __cplusplus preprocessor macro
--- a/2
+++ b/2
@@ -51,7 +51,7 @@ Other Convenience Targets
   * config:        Run cmake configuration tool to set build options.
   * deps:          Build library dependencies (intended only for platform maintainers).

-                    The existance of locally build dependancies overrides the pre-built dependencies from subversion.
+                    The existence of locally built dependencies overrides the pre-built dependencies from subversion.
                    These must be manually removed from '../lib/' to go back to using the pre-compiled libraries.

 Project Files
--- a/build_files/build_environment/cmake/harvest.cmake
+++ b/build_files/build_environment/cmake/harvest.cmake
@@ -17,7 +17,7 @@
 # ***** END GPL LICENSE BLOCK *****

 ########################################################################
-# Copy all generated files to the proper strucure as blender prefers
+# Copy all generated files to the proper structure as blender prefers
 ########################################################################

 if(NOT DEFINED HARVEST_TARGET)
--- a/build_files/build_environment/cmake/options.cmake
+++ b/build_files/build_environment/cmake/options.cmake
@@ -39,7 +39,7 @@ endif()
 set(DOWNLOAD_DIR "${CMAKE_CURRENT_BINARY_DIR}/downloads" CACHE STRING "Path for downloaded files")
 # This path must be hard-coded like this, so that the GNUmakefile knows where it is and can pass it to make_source_archive.py:
 set(PACKAGE_DIR "${CMAKE_CURRENT_BINARY_DIR}/packages")
-option(PACKAGE_USE_UPSTREAM_SOURCES "Use soures upstream to download the package sources, when OFF the blender mirror will be used" ON)
+option(PACKAGE_USE_UPSTREAM_SOURCES "Use sources upstream to download the package sources, when OFF the blender mirror will be used" ON)

 file(TO_CMAKE_PATH ${DOWNLOAD_DIR} DOWNLOAD_DIR)
 file(TO_CMAKE_PATH ${PACKAGE_DIR} PACKAGE_DIR)
--- a/build_files/build_environment/cmake/package_python.cmake
+++ b/build_files/build_environment/cmake/package_python.cmake
@@ -24,7 +24,7 @@ if(MSVC)
    add_custom_command(
      OUTPUT ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND echo packaging python
-      COMMAND echo this should ouput at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
+      COMMAND echo this should output at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PYTARGET}/libs
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}.lib ${PYTARGET}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}.lib
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/python.exe ${PYTARGET}/bin/python.exe
@@ -43,7 +43,7 @@ if(MSVC)
    add_custom_command(
      OUTPUT ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND echo packaging python
-      COMMAND echo this should ouput at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
+      COMMAND echo this should output at ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PYTARGET}/libs
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}${PYTHON_POSTFIX}.lib ${PYTARGET}/libs/python${PYTHON_SHORT_VERSION_NO_DOTS}${PYTHON_POSTFIX}.lib
      COMMAND ${CMAKE_COMMAND} -E copy ${PYSRC}/python${PYTHON_POSTFIX}.exe ${PYTARGET}/bin/python${PYTHON_POSTFIX}.exe
--- a/build_files/build_environment/install_deps.sh
+++ b/build_files/build_environment/install_deps.sh
@@ -1826,7 +1826,7 @@ compile_OCIO() {
    # Force linking against static libs
    #rm -f $_inst/lib/*.so*

-    # Additional depencencies
+    # Additional dependencies
    #cp ext/dist/lib/libtinyxml.a $_inst/lib
    #cp ext/dist/lib/libyaml-cpp.a $_inst/lib

--- a/build_files/cmake/cmake_netbeans_project.py
+++ b/build_files/cmake/cmake_netbeans_project.py
@@ -180,7 +180,7 @@ def create_nb_project_main():
        f.write('    </logicalFolder>\n')

        f.write('  </logicalFolder>\n')
-        # default, but this dir is infact not in blender dir so we can ignore it
+        # default, but this dir is in fact not in blender dir so we can ignore it
        # f.write('  <sourceFolderFilter>^(nbproject)$</sourceFolderFilter>\n')
        f.write(r'  <sourceFolderFilter>^(nbproject|__pycache__|.*\.py|.*\.html|.*\.blend)$</sourceFolderFilter>\n')

--- a/build_files/cmake/macros.cmake
+++ b/build_files/cmake/macros.cmake
@@ -529,7 +529,7 @@ function(SETUP_LIBDIRS)

  # NOTE: For all new libraries, use absolute library paths.
  # This should eventually be phased out.
-  # APPLE plaform uses full paths for linking libraries, and avoids link_directories.
+  # APPLE platform uses full paths for linking libraries, and avoids link_directories.
  if(NOT MSVC AND NOT APPLE)
    link_directories(${JPEG_LIBPATH} ${PNG_LIBPATH} ${ZLIB_LIBPATH} ${FREETYPE_LIBPATH})

--- a/build_files/cmake/platform/platform_win32_bundle_crt.cmake
+++ b/build_files/cmake/platform/platform_win32_bundle_crt.cmake
@@ -27,7 +27,7 @@ if(WITH_WINDOWS_BUNDLE_CRT)
  # Install the CRT to the blender.crt Sub folder.
  install(FILES ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS} DESTINATION ./blender.crt COMPONENT Libraries)

-  # Generating the manifest is a relativly expensive operation since
+  # Generating the manifest is a relatively expensive operation since
  # it is collecting an sha1 hash for every file required. so only do
  # this work when the libs have either changed or the manifest does
  # not exist yet.
--- a/build_files/config/pipeline_config.yaml
+++ b/build_files/config/pipeline_config.yaml
@@ -5,38 +5,38 @@
 update-code:
    git:
        submodules:
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: release/scripts/addons_contrib
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: release/datafiles/locale
-        -   branch: blender-v3.0-release
+        -   branch: master
            commit_id: HEAD
            path: source/tools
    svn:
        libraries:
            darwin-arm64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin_arm64
            darwin-x86_64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/darwin
            linux-x86_64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/linux_centos7_x86_64
            windows-amd64:
-                branch: tags/blender-3.0-release
+                branch: trunk
                commit_id: HEAD
                path: lib/win64_vc15
        tests:
-            branch: tags/blender-3.0-release
+            branch: trunk
            commit_id: HEAD
            path: lib/tests
        benchmarks:
--- a/doc/doxygen/Doxyfile
+++ b/doc/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = Blender
 # could be handy for archiving the generated documentation or if some version
 # control system is used.

-PROJECT_NUMBER         = V3.0
+PROJECT_NUMBER         = V3.1

 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
--- a/doc/python_api/examples/bpy.app.timers.5.py
+++ b/doc/python_api/examples/bpy.app.timers.5.py
@@ -11,7 +11,7 @@ import queue

 execution_queue = queue.Queue()

-# This function can savely be called in another thread.
+# This function can safely be called in another thread.
 # The function will be executed when the timer runs the next time.
 def run_in_main_thread(function):
    execution_queue.put(function)
--- a/doc/python_api/rst/info_gotcha.rst
+++ b/doc/python_api/rst/info_gotcha.rst
@@ -728,7 +728,7 @@ Abusing RNA property callbacks
 ------------------------------

 Python-defined RNA properties can have custom callbacks. Trying to perform complex operations
-from there, like calling an operator, may work, but is not officialy recommended nor supported.
+from there, like calling an operator, may work, but is not officially recommended nor supported.

 Main reason is that those callback should be very fast, but additionally, it may for example
 create issues with undo/redo system (most operators store an history step, and editing an RNA
--- a/doc/python_api/sphinx_doc_gen.py
+++ b/doc/python_api/sphinx_doc_gen.py
@@ -1224,7 +1224,10 @@ def pycontext2sphinx(basepath):
        while char_array[i] is not None:
            member = ctypes.string_at(char_array[i]).decode(encoding="ascii")
            fw(".. data:: %s\n\n" % member)
-            member_type, is_seq = context_type_map[member]
+            try:
+                member_type, is_seq = context_type_map[member]
+            except KeyError:
+                raise SystemExit("Error: context key %r not found in context_type_map; update %s" % (member, __file__)) from None
            fw("   :type: %s :class:`bpy.types.%s`\n\n" % ("sequence of " if is_seq else "", member_type))
            unique.add(member)
            i += 1
@@ -2251,7 +2254,7 @@ def main():
    # First monkey patch to load in fake members.
    setup_monkey_patch()

-    # Perform changes to Blender it's self.
+    # Perform changes to Blender itself.
    setup_data = setup_blender()

    # eventually, create the dirs
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -226,6 +226,9 @@ add_definitions(
  -DCCL_NAMESPACE_END=}
 )

+if(WITH_CYCLES_DEBUG)
+  add_definitions(-DWITH_CYCLES_DEBUG)
+endif()
 if(WITH_CYCLES_STANDALONE_GUI)
  add_definitions(-DWITH_CYCLES_STANDALONE_GUI)
 endif()
@@ -334,7 +337,7 @@ else()
 endif()

 # Warnings
-if(CMAKE_COMPILER_IS_GNUCXX)
+if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_C_COMPILER_ID MATCHES "Clang")
  ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_float_conversion "-Werror=float-conversion")
  ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_cxxflag_double_promotion "-Werror=double-promotion")
  ADD_CHECK_CXX_COMPILER_FLAG(CMAKE_CXX_FLAGS _has_no_error_unused_macros "-Wno-error=unused-macros")
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -138,11 +138,6 @@ endif()

 blender_add_lib(bf_intern_cycles "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")

-# avoid link failure with clang 3.4 debug
-if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS '3.4')
-  string(APPEND CMAKE_CXX_FLAGS_DEBUG " -gline-tables-only")
-endif()
-
 add_dependencies(bf_intern_cycles bf_rna)

 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${ADDON_FILES}" ${CYCLES_INSTALL_PATH})
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -218,6 +218,12 @@ enum_denoising_prefilter = (
    ('ACCURATE', "Accurate", "Prefilter noisy guiding passes before denoising color. Improves quality when guiding passes are noisy using extra processing time", 3),
 )

+enum_direct_light_sampling_type = (
+    ('MULTIPLE_IMPORTANCE_SAMPLING', "Multiple Importance Sampling", "Multiple importance sampling is used to combine direct light contributions from next-event estimation and forward path tracing", 0),
+    ('FORWARD_PATH_TRACING', "Forward Path Tracing", "Direct light contributions are only sampled using forward path tracing", 1),
+    ('NEXT_EVENT_ESTIMATION', "Next-Event Estimation", "Direct light contributions are only sampled using next-event estimation", 2),
+)
+
 def update_render_passes(self, context):
    scene = context.scene
    view_layer = context.view_layer
@@ -325,6 +331,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        default=1024,
    )

+    sample_offset: IntProperty(
+        name="Sample Offset",
+        description="Number of samples to skip when starting render",
+        min=0, max=(1 << 24),
+        default=0,
+    )
+
    time_limit: FloatProperty(
        name="Time Limit",
        description="Limit the render time (excluding synchronization time)."
@@ -415,6 +428,13 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
        default=0,
    )

+    direct_light_sampling_type: EnumProperty(
+        name="Direct Light Sampling Type",
+        description="The type of strategy used for sampling direct light contributions",
+        items=enum_direct_light_sampling_type,
+        default='MULTIPLE_IMPORTANCE_SAMPLING',
+    )
+
    min_light_bounces: IntProperty(
        name="Min Light Bounces",
        description="Minimum number of light bounces. Setting this higher reduces noise in the first bounces, "
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -290,6 +290,9 @@ class CYCLES_RENDER_PT_sampling_advanced(CyclesButtonsPanel, Panel):
        col.active = not (cscene.use_adaptive_sampling and cscene.use_preview_adaptive_sampling)
        col.prop(cscene, "sampling_pattern", text="Pattern")

+        col = layout.column(align=True)
+        col.prop(cscene, "sample_offset")
+
        layout.separator()

        heading = layout.column(align=True, heading="Scrambling Distance")
--- a/intern/cycles/blender/curves.cpp
+++ b/intern/cycles/blender/curves.cpp
@@ -199,7 +199,7 @@ static bool ObtainCacheParticleUV(Hair *hair,
          b_mesh->uv_layers.begin(l);

          float2 uv = zero_float2();
-          if (b_mesh->uv_layers.length())
+          if (!b_mesh->uv_layers.empty())
            b_psys.uv_on_emitter(psmd, *b_pa, pa_no, uv_num, &uv.x);
          CData->curve_uv.push_back_slow(uv);

@@ -261,7 +261,7 @@ static bool ObtainCacheParticleVcol(Hair *hair,
          b_mesh->vertex_colors.begin(l);

          float4 vcol = make_float4(0.0f, 0.0f, 0.0f, 1.0f);
-          if (b_mesh->vertex_colors.length())
+          if (!b_mesh->vertex_colors.empty())
            b_psys.mcol_on_emitter(psmd, *b_pa, pa_no, vcol_num, &vcol.x);
          CData->curve_vcol.push_back_slow(vcol);

--- a/intern/cycles/blender/display_driver.cpp
+++ b/intern/cycles/blender/display_driver.cpp
@@ -334,7 +334,7 @@ bool BlenderDisplayDriver::update_begin(const Params &params,

  /* Update PBO dimensions if needed.
   *
-   * NOTE: Allocate the PBO for the the size which will fit the final render resolution (as in,
+   * NOTE: Allocate the PBO for the size which will fit the final render resolution (as in,
   * at a resolution divider 1. This was we don't need to recreate graphics interoperability
   * objects which are costly and which are tied to the specific underlying buffer size.
   * The downside of this approach is that when graphics interoperability is not used we are
--- a/intern/cycles/blender/mesh.cpp
+++ b/intern/cycles/blender/mesh.cpp
@@ -555,7 +555,7 @@ static void attr_create_vertex_color(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh,
 /* Create uv map attributes. */
 static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)
 {
-  if (b_mesh.uv_layers.length() != 0) {
+  if (!b_mesh.uv_layers.empty()) {
    for (BL::MeshUVLoopLayer &l : b_mesh.uv_layers) {
      const bool active_render = l.active_render();
      AttributeStandard uv_std = (active_render) ? ATTR_STD_UV : ATTR_STD_NONE;
@@ -619,7 +619,7 @@ static void attr_create_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh)

 static void attr_create_subd_uv_map(Scene *scene, Mesh *mesh, BL::Mesh &b_mesh, bool subdivide_uvs)
 {
-  if (b_mesh.uv_layers.length() != 0) {
+  if (!b_mesh.uv_layers.empty()) {
    BL::Mesh::uv_layers_iterator l;
    int i = 0;

@@ -951,7 +951,7 @@ static void create_mesh(Scene *scene,
  N = attr_N->data_float3();

  /* create generated coordinates from undeformed coordinates */
-  const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.length() == 0) &&
+  const bool need_default_tangent = (subdivision == false) && (b_mesh.uv_layers.empty()) &&
                                    (mesh->need_attribute(scene, ATTR_STD_UV_TANGENT));
  if (mesh->need_attribute(scene, ATTR_STD_GENERATED) || need_default_tangent) {
    Attribute *attr = attributes.add(ATTR_STD_GENERATED);
--- a/intern/cycles/blender/sync.cpp
+++ b/intern/cycles/blender/sync.cpp
@@ -392,6 +392,12 @@ void BlenderSync::sync_integrator(BL::ViewLayer &b_view_layer, bool background)
    integrator->set_ao_bounces(0);
  }

+#ifdef WITH_CYCLES_DEBUG
+  DirectLightSamplingType direct_light_sampling_type = (DirectLightSamplingType)get_enum(
+      cscene, "direct_light_sampling_type", DIRECT_LIGHT_SAMPLING_NUM, DIRECT_LIGHT_SAMPLING_MIS);
+  integrator->set_direct_light_sampling_type(direct_light_sampling_type);
+#endif
+
  const DenoiseParams denoise_params = get_denoise_params(b_scene, b_view_layer, background);
  integrator->set_use_denoise(denoise_params.use);

@@ -835,18 +841,25 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
  /* samples */
  int samples = get_int(cscene, "samples");
  int preview_samples = get_int(cscene, "preview_samples");
+  int sample_offset = get_int(cscene, "sample_offset");

  if (background) {
    params.samples = samples;
+    params.sample_offset = sample_offset;
  }
  else {
    params.samples = preview_samples;
-    if (params.samples == 0)
+    if (params.samples == 0) {
      params.samples = INT_MAX;
+    }
+    params.sample_offset = 0;
  }

+  /* Clamp sample offset. */
+  params.sample_offset = clamp(params.sample_offset, 0, Integrator::MAX_SAMPLES);
+
  /* Clamp samples. */
-  params.samples = min(params.samples, Integrator::MAX_SAMPLES);
+  params.samples = clamp(params.samples, 0, Integrator::MAX_SAMPLES - params.sample_offset);

  /* Viewport Performance */
  params.pixel_size = b_engine.get_preview_pixel_size(b_scene);
@@ -865,7 +878,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,

  /* Time limit. */
  if (background) {
-    params.time_limit = get_float(cscene, "time_limit");
+    params.time_limit = (double)get_float(cscene, "time_limit");
  }
  else {
    /* For the viewport it kind of makes more sense to think in terms of the noise floor, which is
--- a/intern/cycles/blender/util.h
+++ b/intern/cycles/blender/util.h
@@ -303,7 +303,7 @@ static inline string image_user_file_path(BL::ImageUser &iuser,
  string filepath_str = string(filepath);
  if (load_tiled && ima.source() == BL::Image::source_TILED) {
    string udim;
-    if (ima.tiles.length() > 0) {
+    if (!ima.tiles.empty()) {
      udim = to_string(ima.tiles[0].number());
    }
    string_replace(filepath_str, udim, "<UDIM>");
@@ -647,7 +647,7 @@ static inline Mesh::SubdivisionType object_subdivision_type(BL::Object &b_ob,
 {
  PointerRNA cobj = RNA_pointer_get(&b_ob.ptr, "cycles");

-  if (cobj.data && b_ob.modifiers.length() > 0 && experimental) {
+  if (cobj.data && !b_ob.modifiers.empty() && experimental) {
    BL::Modifier mod = b_ob.modifiers[b_ob.modifiers.length() - 1];
    bool enabled = preview ? mod.show_viewport() : mod.show_render();

--- a/intern/cycles/bvh/embree.cpp
+++ b/intern/cycles/bvh/embree.cpp
@@ -303,7 +303,7 @@ static void rtc_error_func(void *, enum RTCError, const char *str)
  VLOG(1) << str;
 }

-static double progress_start_time = 0.0f;
+static double progress_start_time = 0.0;

 static bool rtc_progress_func(void *user_ptr, const double n)
 {
--- a/intern/cycles/bvh/node.cpp
+++ b/intern/cycles/bvh/node.cpp
@@ -153,7 +153,7 @@ void BVHNode::update_time()
 namespace {

 struct DumpTraversalContext {
-  /* Descriptor of wile where writing is happening. */
+  /* Descriptor of file where writing is happening. */
  FILE *stream;
  /* Unique identifier of the node current. */
  int id;
--- a/intern/cycles/bvh/node.h
+++ b/intern/cycles/bvh/node.h
@@ -178,7 +178,7 @@ class InnerNode : public BVHNode {
    reset_unused_children();
  }

-  /* NOTE: This function is only used during binary BVH builder, and it
+  /* NOTE: This function is only used during binary BVH builder, and it's
   * supposed to be configured to have 2 children which will be filled-in in a
   * bit. But this is important to have children reset to NULL. */
  explicit InnerNode(const BoundBox &bounds) : BVHNode(bounds), num_children_(0)
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -88,7 +88,7 @@ endmacro()

 function(cycles_link_directories)
  if(APPLE)
-    # APPLE plaform uses full paths for linking libraries, and avoids link_directories.
+    # APPLE platform uses full paths for linking libraries, and avoids link_directories.
    return()
  endif()

--- a/intern/cycles/device/cpu/device.cpp
+++ b/intern/cycles/device/cpu/device.cpp
@@ -38,7 +38,6 @@ void device_cpu_info(vector<DeviceInfo> &devices)
  info.id = "CPU";
  info.num = 0;
  info.has_osl = true;
-  info.has_half_images = true;
  info.has_nanovdb = true;
  info.has_profiling = true;
  if (openimagedenoise_supported()) {
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -93,11 +93,6 @@ CPUDevice::~CPUDevice()
  texture_info.free();
 }

-bool CPUDevice::show_samples() const
-{
-  return (info.cpu_threads == 1);
-}
-
 BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
 {
  BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -60,8 +60,6 @@ class CPUDevice : public Device {
  CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
  ~CPUDevice();

-  virtual bool show_samples() const override;
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  /* Returns true if the texture info was copied to the device (meaning, some more
--- a/intern/cycles/device/cuda/device.cpp
+++ b/intern/cycles/device/cuda/device.cpp
@@ -144,7 +144,6 @@ void device_cuda_info(vector<DeviceInfo> &devices)
    info.description = string(name);
    info.num = num;

-    info.has_half_images = (major >= 3);
    info.has_nanovdb = true;
    info.denoisers = 0;

--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -46,12 +46,6 @@ bool CUDADevice::have_precompiled_kernels()
  return path_exists(cubins_path);
 }

-bool CUDADevice::show_samples() const
-{
-  /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
-  return true;
-}
-
 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
 {
  return BVH_LAYOUT_BVH2;
@@ -242,6 +236,10 @@ string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
  cflags += " -DWITH_NANOVDB";
 #  endif

+#  ifdef WITH_CYCLES_DEBUG
+  cflags += " -DWITH_CYCLES_DEBUG";
+#  endif
+
  return cflags;
 }

@@ -932,7 +930,6 @@ void CUDADevice::tex_alloc(device_texture &mem)
 {
  CUDAContextScope scope(this);

-  /* General variables for both architectures */
  string bind_name = mem.name;
  size_t dsize = datatype_size(mem.data_type);
  size_t size = mem.memory_size();
@@ -1095,7 +1092,6 @@ void CUDADevice::tex_alloc(device_texture &mem)

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
-    /* Kepler+, bindless textures. */
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -76,8 +76,6 @@ class CUDADevice : public Device {

  static bool have_precompiled_kernels();

-  virtual bool show_samples() const override;
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -286,7 +286,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
  info.description = "Multi Device";
  info.num = 0;

-  info.has_half_images = true;
  info.has_nanovdb = true;
  info.has_osl = true;
  info.has_profiling = true;
@@ -333,7 +332,6 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
    }

    /* Accumulate device info. */
-    info.has_half_images &= device.has_half_images;
    info.has_nanovdb &= device.has_nanovdb;
    info.has_osl &= device.has_osl;
    info.has_profiling &= device.has_profiling;
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -73,7 +73,6 @@ class DeviceInfo {
  int num;
  bool display_device;        /* GPU is used as a display device. */
  bool has_nanovdb;           /* Support NanoVDB volumes. */
-  bool has_half_images;       /* Support half-float textures. */
  bool has_osl;               /* Support Open Shading Language. */
  bool has_profiling;         /* Supports runtime collection of profiling info. */
  bool has_peer_memory;       /* GPU has P2P access to memory of another GPU. */
@@ -90,7 +89,6 @@ class DeviceInfo {
    num = 0;
    cpu_threads = 0;
    display_device = false;
-    has_half_images = false;
    has_nanovdb = false;
    has_osl = false;
    has_profiling = false;
@@ -151,10 +149,6 @@ class Device {
    fprintf(stderr, "%s\n", error.c_str());
    fflush(stderr);
  }
-  virtual bool show_samples() const
-  {
-    return false;
-  }
  virtual BVHLayoutMask get_bvh_layout_mask() const = 0;

  /* statistics */
--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -148,7 +148,6 @@ void device_hip_info(vector<DeviceInfo> &devices)
    info.description = string(name);
    info.num = num;

-    info.has_half_images = true;
    info.has_nanovdb = true;
    info.denoisers = 0;

--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
@@ -47,12 +47,6 @@ bool HIPDevice::have_precompiled_kernels()
  return path_exists(fatbins_path);
 }

-bool HIPDevice::show_samples() const
-{
-  /* The HIPDevice only processes one tile at a time, so showing samples is fine. */
-  return true;
-}
-
 BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
 {
  return BVH_LAYOUT_BVH2;
@@ -243,7 +237,7 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c
  hipGetDeviceProperties(&props, hipDevId);

  /* gcnArchName can contain tokens after the arch name with features, ie.
-    "gfx1010:sramecc-:xnack-" so we tokenize it to get the first part. */
+   * `gfx1010:sramecc-:xnack-` so we tokenize it to get the first part. */
  char *arch = strtok(props.gcnArchName, ":");
  if (arch == NULL) {
    arch = props.gcnArchName;
@@ -374,10 +368,9 @@ string HIPDevice::compile_kernel(const uint kernel_features, const char *name, c

 bool HIPDevice::load_kernels(const uint kernel_features)
 {
-  /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
+  /* TODO(sergey): Support kernels re-load for HIP devices adaptive compile.
   *
-   * Currently re-loading kernel will invalidate memory pointers,
-   * causing problems in cuCtxSynchronize.
+   * Currently re-loading kernels will invalidate memory pointers.
   */
  if (hipModule) {
    if (use_adaptive_compilation()) {
@@ -899,7 +892,6 @@ void HIPDevice::tex_alloc(device_texture &mem)
 {
  HIPContextScope scope(this);

-  /* General variables for both architectures */
  string bind_name = mem.name;
  size_t dsize = datatype_size(mem.data_type);
  size_t size = mem.memory_size();
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -75,8 +75,6 @@ class HIPDevice : public Device {

  static bool have_precompiled_kernels();

-  virtual bool show_samples() const override;
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;
@@ -93,9 +91,7 @@ class HIPDevice : public Device {

  virtual string compile_kernel_get_common_cflags(const uint kernel_features);

-  string compile_kernel(const uint kernel_features,
-                        const char *name,
-                        const char *base = "hip");
+  string compile_kernel(const uint kernel_features, const char *name, const char *base = "hip");

  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);
--- a/intern/cycles/device/hip/graphics_interop.h
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -48,7 +48,7 @@ class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
  HIPDeviceQueue *queue_ = nullptr;
  HIPDevice *device_ = nullptr;

-  /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
  uint opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@@ -23,7 +23,7 @@ CCL_NAMESPACE_BEGIN

 device_memory::device_memory(Device *device, const char *name, MemoryType type)
    : data_type(device_type_traits<uchar>::data_type),
-      data_elements(device_type_traits<uchar>::num_elements_cpu),
+      data_elements(device_type_traits<uchar>::num_elements),
      data_size(0),
      device_size(0),
      data_width(0),
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@@ -81,155 +81,140 @@ static constexpr size_t datatype_size(DataType datatype)

 template<typename T> struct device_type_traits {
  static const DataType data_type = TYPE_UNKNOWN;
-  static const size_t num_elements_cpu = sizeof(T);
-  static const size_t num_elements_gpu = sizeof(T);
+  static const size_t num_elements = sizeof(T);
 };

 template<> struct device_type_traits<uchar> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uchar) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar2> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uchar2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar3> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uchar3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar4> {
  static const DataType data_type = TYPE_UCHAR;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uchar4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint2> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(uint2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint3> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 3;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(uint3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint4> {
  static const DataType data_type = TYPE_UINT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(uint4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(int) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int2> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(int2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int3> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<int4> {
  static const DataType data_type = TYPE_INT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(int4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(float) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float2> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 2;
-  static const size_t num_elements_gpu = 2;
-  static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 2;
+  static_assert(sizeof(float2) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float3> {
+  /* float3 has different size depending on the device, can't use it for interchanging
+   * memory between CPU and GPU.
+   *
+   * Leave body empty to trigger a compile error if used. */
+};
+
+template<> struct device_type_traits<packed_float3> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 3;
-  static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 3;
+  static_assert(sizeof(packed_float3) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<float4> {
  static const DataType data_type = TYPE_FLOAT;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(float4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<half> {
  static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(half) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<ushort4> {
  static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(ushort4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint16_t> {
  static const DataType data_type = TYPE_UINT16;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint16_t) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<half4> {
  static const DataType data_type = TYPE_HALF;
-  static const size_t num_elements_cpu = 4;
-  static const size_t num_elements_gpu = 4;
-  static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 4;
+  static_assert(sizeof(half4) == num_elements * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint64_t> {
  static const DataType data_type = TYPE_UINT64;
-  static const size_t num_elements_cpu = 1;
-  static const size_t num_elements_gpu = 1;
-  static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
+  static const size_t num_elements = 1;
+  static_assert(sizeof(uint64_t) == num_elements * datatype_size(data_type));
 };

 /* Device Memory
@@ -325,9 +310,7 @@ template<typename T> class device_only_memory : public device_memory {
      : device_memory(device, name, allow_host_memory_fallback ? MEM_READ_WRITE : MEM_DEVICE_ONLY)
  {
    data_type = device_type_traits<T>::data_type;
-    data_elements = max(device_is_cpu() ? device_type_traits<T>::num_elements_cpu :
-                                          device_type_traits<T>::num_elements_gpu,
-                        1);
+    data_elements = max(device_type_traits<T>::num_elements, 1);
  }

  device_only_memory(device_only_memory &&other) noexcept : device_memory(std::move(other))
@@ -383,15 +366,11 @@ template<typename T> class device_only_memory : public device_memory {

 template<typename T> class device_vector : public device_memory {
 public:
-  /* Can only use this for types that have the same size on CPU and GPU. */
-  static_assert(device_type_traits<T>::num_elements_cpu ==
-                device_type_traits<T>::num_elements_gpu);
-
  device_vector(Device *device, const char *name, MemoryType type)
      : device_memory(device, name, type)
  {
    data_type = device_type_traits<T>::data_type;
-    data_elements = device_type_traits<T>::num_elements_cpu;
+    data_elements = device_type_traits<T>::num_elements;
    modified = true;
    need_realloc_ = true;

--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -109,14 +109,6 @@ class MultiDevice : public Device {
    return error_msg;
  }

-  virtual bool show_samples() const override
-  {
-    if (devices.size() > 1) {
-      return false;
-    }
-    return devices.front().device->show_samples();
-  }
-
  virtual BVHLayoutMask get_bvh_layout_mask() const override
  {
    BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
--- a/intern/cycles/doc/license/readme.txt
+++ b/intern/cycles/doc/license/readme.txt
@@ -3,7 +3,7 @@ This program uses code from various sources, the default license is Apache 2.0
 for all code, with the following exceptions.

 Modified BSD License
-* Code adapated from Open Shading Language
+* Code adapted from Open Shading Language
 * Sobol direction vectors
 * Matrix inversion code from OpenEXR
 * MD5 Hash code
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -380,7 +380,10 @@ void PathTrace::path_trace(RenderWork &render_work)
    PathTraceWork *path_trace_work = path_trace_works_[i].get();

    PathTraceWork::RenderStatistics statistics;
-    path_trace_work->render_samples(statistics, render_work.path_trace.start_sample, num_samples);
+    path_trace_work->render_samples(statistics,
+                                    render_work.path_trace.start_sample,
+                                    num_samples,
+                                    render_work.path_trace.sample_offset);

    const double work_time = time_dt() - work_start_time;
    work_balance_infos_[i].time_spent += work_time;
@@ -850,7 +853,8 @@ void PathTrace::progress_update_if_needed(const RenderWork &render_work)
    const uint64_t num_samples_added = uint64_t(tile_size.x) * tile_size.y *
                                       render_work.path_trace.num_samples;
    const int current_sample = render_work.path_trace.start_sample +
-                               render_work.path_trace.num_samples;
+                               render_work.path_trace.num_samples -
+                               render_work.path_trace.sample_offset;
    progress_->add_samples(num_samples_added, current_sample);
  }

--- a/intern/cycles/integrator/path_trace_display.h
+++ b/intern/cycles/integrator/path_trace_display.h
@@ -76,7 +76,7 @@ class PathTraceDisplay {

  /* Copy buffer of rendered pixels of a given size into a given position of the texture.
   *
-   * This function does not acquire a lock. The reason for this is is to allow use of this function
+   * This function does not acquire a lock. The reason for this is to allow use of this function
   * for partial updates from different devices. In this case the caller will acquire the lock
   * once, update all the slices and release
   * the lock once. This will ensure that draw() will never use partially updated texture. */
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -75,7 +75,10 @@ class PathTraceWork {

  /* Render given number of samples as a synchronous blocking call.
   * The samples are added to the render buffer associated with this work. */
-  virtual void render_samples(RenderStatistics &statistics, int start_sample, int samples_num) = 0;
+  virtual void render_samples(RenderStatistics &statistics,
+                              int start_sample,
+                              int samples_num,
+                              int sample_offset) = 0;

  /* Copy render result from this work to the corresponding place of the GPU display.
   *
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -71,7 +71,8 @@ void PathTraceWorkCPU::init_execution()

 void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
-                                      int samples_num)
+                                      int samples_num,
+                                      int sample_offset)
 {
  const int64_t image_width = effective_buffer_params_.width;
  const int64_t image_height = effective_buffer_params_.height;
@@ -99,6 +100,7 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
      work_tile.w = 1;
      work_tile.h = 1;
      work_tile.start_sample = start_sample;
+      work_tile.sample_offset = sample_offset;
      work_tile.num_samples = 1;
      work_tile.offset = effective_buffer_params_.offset;
      work_tile.stride = effective_buffer_params_.stride;
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -48,7 +48,8 @@ class PathTraceWorkCPU : public PathTraceWork {

  virtual void render_samples(RenderStatistics &statistics,
                              int start_sample,
-                              int samples_num) override;
+                              int samples_num,
+                              int sample_offset) override;

  virtual void copy_to_display(PathTraceDisplay *display,
                               PassMode pass_mode,
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -250,7 +250,8 @@ void PathTraceWorkGPU::init_execution()

 void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
                                      int start_sample,
-                                      int samples_num)
+                                      int samples_num,
+                                      int sample_offset)
 {
  /* Limit number of states for the tile and rely on a greedy scheduling of tiles. This allows to
   * add more work (because tiles are smaller, so there is higher chance that more paths will
@@ -261,6 +262,7 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
  work_tile_scheduler_.reset(effective_buffer_params_,
                             start_sample,
                             samples_num,
+                             sample_offset,
                             device_scene_->data.integrator.scrambling_distance);

  enqueue_reset();
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -46,7 +46,8 @@ class PathTraceWorkGPU : public PathTraceWork {

  virtual void render_samples(RenderStatistics &statistics,
                              int start_sample,
-                              int samples_num) override;
+                              int samples_num,
+                              int sample_offset) override;

  virtual void copy_to_display(PathTraceDisplay *display,
                               PassMode pass_mode,
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -88,6 +88,16 @@ int RenderScheduler::get_num_samples() const
  return num_samples_;
 }

+void RenderScheduler::set_sample_offset(int sample_offset)
+{
+  sample_offset_ = sample_offset;
+}
+
+int RenderScheduler::get_sample_offset() const
+{
+  return sample_offset_;
+}
+
 void RenderScheduler::set_time_limit(double time_limit)
 {
  time_limit_ = time_limit;
@@ -110,13 +120,15 @@ int RenderScheduler::get_num_rendered_samples() const
  return state_.num_rendered_samples;
 }

-void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
+void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples, int sample_offset)
 {
  buffer_params_ = buffer_params;

  update_start_resolution_divider();

  set_num_samples(num_samples);
+  set_start_sample(sample_offset);
+  set_sample_offset(sample_offset);

  /* In background mode never do lower resolution render preview, as it is not really supported
   * by the software. */
@@ -171,7 +183,7 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)

 void RenderScheduler::reset_for_next_tile()
 {
-  reset(buffer_params_, num_samples_);
+  reset(buffer_params_, num_samples_, sample_offset_);
 }

 bool RenderScheduler::render_work_reschedule_on_converge(RenderWork &render_work)
@@ -317,6 +329,7 @@ RenderWork RenderScheduler::get_render_work()

  render_work.path_trace.start_sample = get_start_sample_to_path_trace();
  render_work.path_trace.num_samples = get_num_samples_to_path_trace();
+  render_work.path_trace.sample_offset = get_sample_offset();

  render_work.init_render_buffers = (render_work.path_trace.start_sample == get_start_sample());

@@ -835,7 +848,7 @@ int RenderScheduler::get_num_samples_to_path_trace() const
     * When time limit is not used the number of samples per render iteration is either increasing
     * or stays the same, so there is no need to clamp number of samples calculated for occupancy.
     */
-    if (time_limit_ && state_.start_render_time) {
+    if (time_limit_ != 0.0 && state_.start_render_time != 0.0) {
      const double remaining_render_time = max(
          0.0, time_limit_ - (time_dt() - state_.start_render_time));
      const double time_per_sample_average = path_trace_time_.get_average();
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -39,6 +39,7 @@ class RenderWork {
  struct {
    int start_sample = 0;
    int num_samples = 0;
+    int sample_offset = 0;
  } path_trace;

  struct {
@@ -125,6 +126,9 @@ class RenderScheduler {
  void set_num_samples(int num_samples);
  int get_num_samples() const;

+  void set_sample_offset(int sample_offset);
+  int get_sample_offset() const;
+
  /* Time limit for the path tracing tasks, in minutes.
   * Zero disables the limit. */
  void set_time_limit(double time_limit);
@@ -150,7 +154,7 @@ class RenderScheduler {

  /* Reset scheduler, indicating that rendering will happen from scratch.
   * Resets current rendered state, as well as scheduling information. */
-  void reset(const BufferParams &buffer_params, int num_samples);
+  void reset(const BufferParams &buffer_params, int num_samples, int sample_offset);

  /* Reset scheduler upon switching to a next tile.
   * Will keep the same number of samples and full-frame render parameters, but will reset progress
@@ -419,6 +423,8 @@ class RenderScheduler {
  int start_sample_ = 0;
  int num_samples_ = 0;

+  int sample_offset_ = 0;
+
  /* Limit in seconds for how long path tracing is allowed to happen.
   * Zero means no limit is applied. */
  double time_limit_ = 0.0;
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -36,6 +36,7 @@ void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 void WorkTileScheduler::reset(const BufferParams &buffer_params,
                              int sample_start,
                              int samples_num,
+                              int sample_offset,
                              float scrambling_distance)
 {
  /* Image buffer parameters. */
@@ -51,6 +52,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
  /* Samples parameters. */
  sample_start_ = sample_start;
  samples_num_ = samples_num;
+  sample_offset_ = sample_offset;

  /* Initialize new scheduling. */
  reset_scheduler_state();
@@ -111,6 +113,7 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_
  work_tile.h = tile_size_.height;
  work_tile.start_sample = sample_start_ + start_sample;
  work_tile.num_samples = min(tile_size_.num_samples, samples_num_ - start_sample);
+  work_tile.sample_offset = sample_offset_;
  work_tile.offset = offset_;
  work_tile.stride = stride_;

--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -41,6 +41,7 @@ class WorkTileScheduler {
  void reset(const BufferParams &buffer_params,
             int sample_start,
             int samples_num,
+             int sample_offset,
             float scrambling_distance);

  /* Get work for a device.
@@ -79,6 +80,7 @@ class WorkTileScheduler {
   * (splitting into a smaller work tiles). */
  int sample_start_ = 0;
  int samples_num_ = 0;
+  int sample_offset_ = 0;

  /* Tile size which be scheduled for rendering. */
  TileSize tile_size_;
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -39,6 +39,10 @@ set(SRC_KERNEL_DEVICE_HIP
  device/hip/kernel.cpp
 )

+set(SRC_KERNEL_DEVICE_METAL
+  device/metal/kernel.metal
+)
+
 set(SRC_KERNEL_DEVICE_OPTIX
  device/optix/kernel.cu
  device/optix/kernel_shader_raytrace.cu
@@ -79,6 +83,13 @@ set(SRC_KERNEL_DEVICE_OPTIX_HEADERS
  device/optix/globals.h
 )

+set(SRC_KERNEL_DEVICE_METAL_HEADERS
+  device/metal/compat.h
+  device/metal/context_begin.h
+  device/metal/context_end.h
+  device/metal/globals.h
+)
+
 set(SRC_KERNEL_CLOSURE_HEADERS
  closure/alloc.h
  closure/bsdf.h
@@ -262,6 +273,7 @@ set(SRC_KERNEL_UTIL_HEADERS
 )

 set(SRC_KERNEL_TYPES_HEADERS
+  tables.h
  textures.h
  types.h
 )
@@ -399,12 +411,8 @@ if(WITH_CYCLES_CUDA_BINARIES)
      -I ${CMAKE_CURRENT_SOURCE_DIR}/..
      -I ${CMAKE_CURRENT_SOURCE_DIR}/device/cuda
      --use_fast_math
-      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file})
-
-    if(${experimental})
-      set(cuda_flags ${cuda_flags} -D __KERNEL_EXPERIMENTAL__)
-      set(name ${name}_experimental)
-    endif()
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${cuda_file}
+      -Wno-deprecated-gpu-targets)

    if(WITH_NANOVDB)
      set(cuda_flags ${cuda_flags}
@@ -412,6 +420,10 @@ if(WITH_CYCLES_CUDA_BINARIES)
        -I "${NANOVDB_INCLUDE_DIR}")
    endif()

+    if(WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+    endif()
+
    if(WITH_CYCLES_CUBIN_COMPILER)
      string(SUBSTRING ${arch} 3 -1 CUDA_ARCH)

@@ -560,11 +572,6 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
      -ffast-math
      -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})

-    if(${experimental})
-      set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
-      set(name ${name}_experimental)
-    endif()
-
    if(WITH_NANOVDB)
      set(hip_flags ${hip_flags}
        -D WITH_NANOVDB
@@ -572,7 +579,7 @@ if(WITH_CYCLES_HIP_BINARIES AND WITH_CYCLES_DEVICE_HIP)
    endif()

    if(WITH_CYCLES_DEBUG)
-      set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+      set(hip_flags ${hip_flags} -D WITH_CYCLES_DEBUG)
    endif()

    add_custom_command(
@@ -613,6 +620,10 @@ if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
        -I "${NANOVDB_INCLUDE_DIR}")
    endif()

+    if(WITH_CYCLES_DEBUG)
+      set(cuda_flags ${cuda_flags} -D WITH_CYCLES_DEBUG)
+    endif()
+
    if(WITH_CYCLES_CUBIN_COMPILER)
      # Needed to find libnvrtc-builtins.so. Can't do it from inside
      # cycles_cubin_cc since the env variable is read before main()
@@ -701,7 +712,7 @@ if(WITH_COMPILER_ASAN)
    string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=all")
    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
  elseif(CMAKE_C_COMPILER_ID MATCHES "Clang")
-    # With OSL, Cycles disables rtti in some modules, wich then breaks at linking
+    # With OSL, Cycles disables rtti in some modules, which then breaks at linking
    # when trying to use vptr sanitizer (included into 'undefined' general option).
    string(APPEND CMAKE_CXX_FLAGS_RELWITHDEBINFO " -fno-sanitize=vptr")
    string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-sanitize=vptr")
@@ -729,12 +740,14 @@ cycles_add_library(cycles_kernel "${LIB}"
  ${SRC_KERNEL_DEVICE_CUDA}
  ${SRC_KERNEL_DEVICE_HIP}
  ${SRC_KERNEL_DEVICE_OPTIX}
+  ${SRC_KERNEL_DEVICE_METAL}
  ${SRC_KERNEL_HEADERS}
  ${SRC_KERNEL_DEVICE_CPU_HEADERS}
  ${SRC_KERNEL_DEVICE_GPU_HEADERS}
  ${SRC_KERNEL_DEVICE_CUDA_HEADERS}
  ${SRC_KERNEL_DEVICE_HIP_HEADERS}
  ${SRC_KERNEL_DEVICE_OPTIX_HEADERS}
+  ${SRC_KERNEL_DEVICE_METAL_HEADERS}
 )

 source_group("bake" FILES ${SRC_KERNEL_BAKE_HEADERS})
@@ -746,6 +759,7 @@ source_group("device\\cuda" FILES ${SRC_KERNEL_DEVICE_CUDA} ${SRC_KERNEL_DEVICE_
 source_group("device\\gpu" FILES ${SRC_KERNEL_DEVICE_GPU_HEADERS})
 source_group("device\\hip" FILES ${SRC_KERNEL_DEVICE_HIP} ${SRC_KERNEL_DEVICE_HIP_HEADERS})
 source_group("device\\optix" FILES ${SRC_KERNEL_DEVICE_OPTIX} ${SRC_KERNEL_DEVICE_OPTIX_HEADERS})
+source_group("device\\metal" FILES ${SRC_KERNEL_DEVICE_METAL} ${SRC_KERNEL_DEVICE_METAL_HEADERS})
 source_group("film" FILES ${SRC_KERNEL_FILM_HEADERS})
 source_group("geom" FILES ${SRC_KERNEL_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_KERNEL_INTEGRATOR_HEADERS})
@@ -778,6 +792,8 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP}" ${CYCLES_
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_DEVICE_METAL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/metal)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_FILM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/film)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNEL_INTEGRATOR_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/integrator)
--- a/intern/cycles/kernel/bvh/util.h
+++ b/intern/cycles/kernel/bvh/util.h
@@ -97,7 +97,7 @@ ccl_device_inline void sort_intersections_and_normals(ccl_private Intersection *
    swapped = false;
    for (int j = 0; j < num_hits - 1; ++j) {
      if (hits[j].t > hits[j + 1].t) {
-        struct Intersection tmp_hit = hits[j];
+        Intersection tmp_hit = hits[j];
        float3 tmp_Ng = Ng[j];
        hits[j] = hits[j + 1];
        Ng[j] = Ng[j + 1];
--- a/intern/cycles/kernel/device/cpu/globals.h
+++ b/intern/cycles/kernel/device/cpu/globals.h
@@ -18,6 +18,7 @@

 #pragma once

+#include "kernel/tables.h"
 #include "kernel/types.h"
 #include "kernel/util/profiling.h"

--- a/intern/cycles/kernel/device/cuda/compat.h
+++ b/intern/cycles/kernel/device/cuda/compat.h
@@ -52,8 +52,9 @@ typedef unsigned long long uint64_t;
 #endif
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -75,6 +76,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - (thread_warp)))

 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -84,7 +86,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */

--- a/intern/cycles/kernel/device/cuda/config.h
+++ b/intern/cycles/kernel/device/cuda/config.h
@@ -92,12 +92,29 @@

 /* Compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread. */
-
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
  extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                   (block_num_threads * thread_num_registers))

+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */

 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
--- a/intern/cycles/kernel/device/gpu/image.h
+++ b/intern/cycles/kernel/device/gpu/image.h
@@ -65,7 +65,9 @@ ccl_device float cubic_h1(float a)

 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
 template<typename T>
-ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, float y)
+ccl_device_noinline T kernel_tex_image_interp_bicubic(ccl_global const TextureInfo &info,
+                                                      float x,
+                                                      float y)
 {
  ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;

@@ -94,7 +96,7 @@ ccl_device_noinline T kernel_tex_image_interp_bicubic(const TextureInfo &info, f
 /* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
 ccl_device_noinline T
-kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
+kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, float y, float z)
 {
  ccl_gpu_tex_object tex = (ccl_gpu_tex_object)info.data;

@@ -169,7 +171,7 @@ ccl_device T kernel_tex_image_interp_tricubic_nanovdb(S &s, float x, float y, fl

 template<typename T>
 ccl_device_noinline T kernel_tex_image_interp_nanovdb(
-    const TextureInfo &info, float x, float y, float z, uint interpolation)
+    ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
 {
  using namespace nanovdb;

@@ -191,7 +193,7 @@ ccl_device_noinline T kernel_tex_image_interp_nanovdb(

 ccl_device float4 kernel_tex_image_interp(KernelGlobals kg, int id, float x, float y)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

  /* float4, byte4, ushort4 and half4 */
  const int texture_type = info.data_type;
@@ -226,7 +228,7 @@ ccl_device float4 kernel_tex_image_interp_3d(KernelGlobals kg,
                                             float3 P,
                                             InterpolationType interp)
 {
-  const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
+  ccl_global const TextureInfo &info = kernel_tex_fetch(__texture_info, id);

  if (info.use_transform_3d) {
    P = transform_point(&info.transform_3d, P);
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -31,10 +31,43 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
 #endif

+#ifdef __KERNEL_METAL__
+struct ActiveIndexContext {
+  ActiveIndexContext(int _thread_index,
+                     int _global_index,
+                     int _threadgroup_size,
+                     int _simdgroup_size,
+                     int _simd_lane_index,
+                     int _simd_group_index,
+                     int _num_simd_groups,
+                     threadgroup int *_simdgroup_offset)
+      : thread_index(_thread_index),
+        global_index(_global_index),
+        blocksize(_threadgroup_size),
+        ccl_gpu_warp_size(_simdgroup_size),
+        thread_warp(_simd_lane_index),
+        warp_index(_simd_group_index),
+        num_warps(_num_simd_groups),
+        warp_offset(_simdgroup_offset)
+  {
+  }
+
+  const int thread_index, global_index, blocksize, ccl_gpu_warp_size, thread_warp, warp_index,
+      num_warps;
+  threadgroup int *warp_offset;
+
+  template<uint blocksizeDummy, typename IsActiveOp>
+  void active_index_array(const uint num_states,
+                          ccl_global int *indices,
+                          ccl_global int *num_indices,
+                          IsActiveOp is_active_op)
+  {
+    const uint state_index = global_index;
+#else
 template<uint blocksize, typename IsActiveOp>
 __device__ void gpu_parallel_active_index_array(const uint num_states,
-                                                int *indices,
-                                                int *num_indices,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
                                                IsActiveOp is_active_op)
 {
  extern ccl_gpu_shared int warp_offset[];
@@ -45,43 +78,62 @@ __device__ void gpu_parallel_active_index_array(const uint num_states,
  const uint warp_index = thread_index / ccl_gpu_warp_size;
  const uint num_warps = blocksize / ccl_gpu_warp_size;

-  /* Test if state corresponding to this thread is active. */
  const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;
-  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
+#endif

-  /* For each thread within a warp compute how many other active states precede it. */
-  const uint thread_mask = 0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp);
-  const uint thread_offset = ccl_gpu_popc(ccl_gpu_ballot(is_active) & thread_mask);
+    /* Test if state corresponding to this thread is active. */
+    const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;

-  /* Last thread in warp stores number of active states for each warp. */
-  if (thread_warp == ccl_gpu_warp_size - 1) {
-    warp_offset[warp_index] = thread_offset + is_active;
-  }
+    /* For each thread within a warp compute how many other active states precede it. */
+    const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
+                                        ccl_gpu_thread_mask(thread_warp));

-  ccl_gpu_syncthreads();
-
-  /* Last thread in block converts per-warp sizes to offsets, increments global size of
-   * index array and gets offset to write to. */
-  if (thread_index == blocksize - 1) {
-    /* TODO: parallelize this. */
-    int offset = 0;
-    for (int i = 0; i < num_warps; i++) {
-      int num_active = warp_offset[i];
-      warp_offset[i] = offset;
-      offset += num_active;
+    /* Last thread in warp stores number of active states for each warp. */
+    if (thread_warp == ccl_gpu_warp_size - 1) {
+      warp_offset[warp_index] = thread_offset + is_active;
    }

-    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
-    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+    ccl_gpu_syncthreads();
+
+    /* Last thread in block converts per-warp sizes to offsets, increments global size of
+     * index array and gets offset to write to. */
+    if (thread_index == blocksize - 1) {
+      /* TODO: parallelize this. */
+      int offset = 0;
+      for (int i = 0; i < num_warps; i++) {
+        int num_active = warp_offset[i];
+        warp_offset[i] = offset;
+        offset += num_active;
+      }
+
+      const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
+      warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
+    }
+
+    ccl_gpu_syncthreads();
+
+    /* Write to index array. */
+    if (is_active) {
+      const uint block_offset = warp_offset[num_warps];
+      indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
+    }
  }

-  ccl_gpu_syncthreads();
+#ifdef __KERNEL_METAL__
+}; /* end class ActiveIndexContext */

-  /* Write to index array. */
-  if (is_active) {
-    const uint block_offset = warp_offset[num_warps];
-    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
-  }
-}
+/* Inject the required thread params into a struct, and redirect to its templated member function
+ */
+#  define gpu_parallel_active_index_array \
+    ActiveIndexContext(metal_local_id, \
+                       metal_global_id, \
+                       metal_local_size, \
+                       simdgroup_size, \
+                       simd_lane_index, \
+                       simd_group_index, \
+                       num_simd_groups, \
+                       simdgroup_offset) \
+        .active_index_array
+#endif

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -33,10 +33,12 @@ CCL_NAMESPACE_BEGIN
 #  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
 #endif

-template<uint blocksize>
-__device__ void gpu_parallel_prefix_sum(int *counter, int *prefix_sum, const int num_values)
+__device__ void gpu_parallel_prefix_sum(const int global_id,
+                                        ccl_global int *counter,
+                                        ccl_global int *prefix_sum,
+                                        const int num_values)
 {
-  if (!(ccl_gpu_block_idx_x == 0 && ccl_gpu_thread_idx_x == 0)) {
+  if (global_id != 0) {
    return;
  }

--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -33,16 +33,16 @@ CCL_NAMESPACE_BEGIN
 #endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)

-template<uint blocksize, typename GetKeyOp>
-__device__ void gpu_parallel_sorted_index_array(const uint num_states,
+template<typename GetKeyOp>
+__device__ void gpu_parallel_sorted_index_array(const uint state_index,
+                                                const uint num_states,
                                                const int num_states_limit,
-                                                int *indices,
-                                                int *num_indices,
-                                                int *key_counter,
-                                                int *key_prefix_sum,
+                                                ccl_global int *indices,
+                                                ccl_global int *num_indices,
+                                                ccl_global int *key_counter,
+                                                ccl_global int *key_prefix_sum,
                                                GetKeyOp get_key_op)
 {
-  const uint state_index = ccl_gpu_block_idx_x * blocksize + ccl_gpu_thread_idx_x;
  const int key = (state_index < num_states) ? get_key_op(state_index) :
                                               GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY;

--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -45,8 +45,9 @@ typedef unsigned long long uint64_t;
 #define ccl_device_forceinline __device__ __forceinline__
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -74,6 +75,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint((1ull << (thread_warp)) - 1)

 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -83,7 +85,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot(predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */
 typedef hipTextureObject_t ccl_gpu_tex_object;
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -35,12 +35,29 @@

 /* Compute number of threads per block and minimum blocks per multiprocessor
 * given the maximum number of registers per thread. */
-
 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
  extern "C" __global__ void __launch_bounds__(block_num_threads, \
                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
                                                   (block_num_threads * thread_num_registers))

+#define ccl_gpu_kernel_threads(block_num_threads) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads)
+
+#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
+
+#define ccl_gpu_kernel_call(x) x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to
+ * specify captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda { \
+    __VA_ARGS__; \
+    __device__ int operator()(const int state) \
+    { \
+      return (func); \
+    } \
+  } ccl_gpu_kernel_lambda_pass
+
 /* sanity checks */

 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
--- a/intern/cycles/kernel/device/metal/compat.h
+++ b/intern/cycles/kernel/device/metal/compat.h
@@ -34,6 +34,7 @@ using namespace metal;

 #pragma clang diagnostic ignored "-Wunused-variable"
 #pragma clang diagnostic ignored "-Wsign-compare"
+#pragma clang diagnostic ignored "-Wuninitialized"

 /* Qualifiers */

@@ -42,8 +43,9 @@ using namespace metal;
 #define ccl_device_forceinline ccl_device
 #define ccl_device_noinline ccl_device __attribute__((noinline))
 #define ccl_device_noinline_cpu ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_global device
-#define ccl_static_constant static constant constexpr
+#define ccl_inline_constant static constant constexpr
 #define ccl_device_constant constant
 #define ccl_constant const device
 #define ccl_gpu_shared threadgroup
@@ -58,6 +60,122 @@ using namespace metal;

 #define kernel_assert(cond)

+#define ccl_gpu_global_id_x() metal_global_id
+#define ccl_gpu_warp_size simdgroup_size
+#define ccl_gpu_thread_idx_x simd_group_index
+#define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << (thread_warp)) - 1)
+
+#define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
+#define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup)
+
+// clang-format off
+
+/* kernel.h adapters */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
+#define ccl_gpu_kernel_threads(block_num_threads)
+
+/* Convert a comma-separated list into a semicolon-separated list
+ * (so that we can generate a struct based on kernel entry-point parameters). */
+#define FN0()
+#define FN1(p1) p1;
+#define FN2(p1, p2) p1; p2;
+#define FN3(p1, p2, p3) p1; p2; p3;
+#define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
+#define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
+#define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
+#define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
+#define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
+#define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
+#define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
+#define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
+#define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
+#define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
+#define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
+#define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
+#define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
+#define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, ...) p16
+#define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
+
+/* Generate a struct containing the entry-point parameters and a "run"
+ * method which can access them implicitly via this-> */
+#define ccl_gpu_kernel_signature(name, ...) \
+struct kernel_gpu_##name \
+{ \
+  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
+  void run(thread MetalKernelContext& context, \
+           threadgroup int *simdgroup_offset, \
+           const uint metal_global_id, \
+           const ushort metal_local_id, \
+           const ushort metal_local_size, \
+           uint simdgroup_size, \
+           uint simd_lane_index, \
+           uint simd_group_index, \
+           uint num_simd_groups) ccl_global const; \
+}; \
+kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
+                                constant KernelParamsMetal &ccl_restrict   _launch_params_metal, \
+                                constant MetalAncillaries *_metal_ancillaries, \
+                                threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
+                                const uint metal_global_id [[thread_position_in_grid]], \
+                                const ushort metal_local_id   [[thread_position_in_threadgroup]], \
+                                const ushort metal_local_size [[threads_per_threadgroup]], \
+                                uint simdgroup_size [[threads_per_simdgroup]], \
+                                uint simd_lane_index [[thread_index_in_simdgroup]], \
+                                uint simd_group_index [[simdgroup_index_in_threadgroup]], \
+                                uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
+  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
+  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
+} \
+void kernel_gpu_##name::run(thread MetalKernelContext& context, \
+                  threadgroup int *simdgroup_offset, \
+                  const uint metal_global_id, \
+                  const ushort metal_local_id, \
+                  const ushort metal_local_size, \
+                  uint simdgroup_size, \
+                  uint simd_lane_index, \
+                  uint simd_group_index, \
+                  uint num_simd_groups) ccl_global const
+
+#define ccl_gpu_kernel_call(x) context.x
+
+/* Define a function object where "func" is the lambda body, and additional parameters are used to specify captured state. */
+#define ccl_gpu_kernel_lambda(func, ...) \
+  struct KernelLambda \
+  { \
+    KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
+    ccl_private MetalKernelContext &context; \
+    __VA_ARGS__; \
+    int operator()(const int state) const { return (func); } \
+  } ccl_gpu_kernel_lambda_pass(context)
+
+// clang-format on
+
+/* volumetric lambda functions - use function objects for lambda-like functionality */
+#define VOLUME_READ_LAMBDA(function_call) \
+  struct FnObjectRead { \
+    KernelGlobals kg; \
+    ccl_private MetalKernelContext *context; \
+    int state; \
+\
+    VolumeStack operator()(const int i) const \
+    { \
+      return context->function_call; \
+    } \
+  } volume_read_lambda_pass{kg, this, state};
+
+#define VOLUME_WRITE_LAMBDA(function_call) \
+  struct FnObjectWrite { \
+    KernelGlobals kg; \
+    ccl_private MetalKernelContext *context; \
+    int state; \
+\
+    void operator()(const int i, VolumeStack entry) const \
+    { \
+      context->function_call; \
+    } \
+  } volume_write_lambda_pass{kg, this, state};
+
 /* make_type definitions with Metal style element initializers */
 #ifdef make_float2
 #  undef make_float2
@@ -112,6 +230,7 @@ using namespace metal;
 #define sinhf(x) sinh(float(x))
 #define coshf(x) cosh(float(x))
 #define tanhf(x) tanh(float(x))
+#define saturatef(x) saturate(float(x))

 /* Use native functions with possibly lower precision for performance,
 * no issues found so far. */
@@ -124,3 +243,43 @@ using namespace metal;
 #define logf(x) trigmode::log(float(x))

 #define NULL 0
+
+#define __device__
+
+/* texture bindings and sampler setup */
+
+struct Texture2DParamsMetal {
+  texture2d<float, access::sample> tex;
+};
+struct Texture3DParamsMetal {
+  texture3d<float, access::sample> tex;
+};
+
+struct MetalAncillaries {
+  device Texture2DParamsMetal *textures_2d;
+  device Texture3DParamsMetal *textures_3d;
+};
+
+#include "util/half.h"
+#include "util/types.h"
+
+enum SamplerType {
+  SamplerFilterNearest_AddressRepeat,
+  SamplerFilterNearest_AddressClampEdge,
+  SamplerFilterNearest_AddressClampZero,
+
+  SamplerFilterLinear_AddressRepeat,
+  SamplerFilterLinear_AddressClampEdge,
+  SamplerFilterLinear_AddressClampZero,
+
+  SamplerCount
+};
+
+constant constexpr array<sampler, SamplerCount> metal_samplers = {
+    sampler(address::repeat, filter::nearest),
+    sampler(address::clamp_to_edge, filter::nearest),
+    sampler(address::clamp_to_zero, filter::nearest),
+    sampler(address::repeat, filter::linear),
+    sampler(address::clamp_to_edge, filter::linear),
+    sampler(address::clamp_to_zero, filter::linear),
+};
--- a/intern/cycles/kernel/device/metal/context_begin.h
+++ b/intern/cycles/kernel/device/metal/context_begin.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// clang-format off
+
+/* Open the Metal kernel context class
+ * Necessary to access resource bindings */
+class MetalKernelContext {
+  public:
+    constant KernelParamsMetal &launch_params_metal;
+    constant MetalAncillaries *metal_ancillaries;
+
+    MetalKernelContext(constant KernelParamsMetal &_launch_params_metal, constant MetalAncillaries * _metal_ancillaries)
+    : launch_params_metal(_launch_params_metal), metal_ancillaries(_metal_ancillaries)
+    {}
+
+    /* texture fetch adapter functions */
+    typedef uint64_t ccl_gpu_tex_object;
+
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      kernel_assert(0);
+      return 0;
+    }
+    template<typename T>
+    inline __attribute__((__always_inline__))
+    T ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      kernel_assert(0);
+      return 0;
+    }
+
+    // texture2d
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_2D(ccl_gpu_tex_object tex, float x, float y) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_2d[tid].tex.sample(metal_samplers[sid], float2(x, y)).x;
+    }
+
+    // texture3d
+    template<>
+    inline __attribute__((__always_inline__))
+    float4 ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z));
+    }
+    template<>
+    inline __attribute__((__always_inline__))
+    float ccl_gpu_tex_object_read_3D(ccl_gpu_tex_object tex, float x, float y, float z) const {
+      const uint tid(tex);
+      const uint sid(tex >> 32);
+      return metal_ancillaries->textures_3d[tid].tex.sample(metal_samplers[sid], float3(x, y, z)).x;
+    }
+#    include "kernel/device/gpu/image.h"
+
+  // clang-format on
--- a/intern/cycles/kernel/device/metal/context_end.h
+++ b/intern/cycles/kernel/device/metal/context_end.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+}
+; /* end of MetalKernelContext class definition */
+
+/* Silently redirect into the MetalKernelContext instance */
+/* NOTE: These macros will need maintaining as entry-points change. */
+
+#undef kernel_integrator_state
+#define kernel_integrator_state context.launch_params_metal.__integrator_state
--- a/intern/cycles/kernel/device/metal/globals.h
+++ b/intern/cycles/kernel/device/metal/globals.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#include "kernel/types.h"
+#include "kernel/util/profiling.h"
+
+#include "kernel/integrator/state.h"
+
+CCL_NAMESPACE_BEGIN
+
+typedef struct KernelParamsMetal {
+
+#define KERNEL_TEX(type, name) ccl_global const type *name;
+#include "kernel/textures.h"
+#undef KERNEL_TEX
+
+  const IntegratorStateGPU __integrator_state;
+  const KernelData data;
+
+} KernelParamsMetal;
+
+typedef struct KernelGlobalsGPU {
+  int unused[1];
+} KernelGlobalsGPU;
+
+typedef ccl_global const KernelGlobalsGPU *ccl_restrict KernelGlobals;
+
+#define kernel_data launch_params_metal.data
+#define kernel_integrator_state launch_params_metal.__integrator_state
+
+/* data lookup defines */
+
+#define kernel_tex_fetch(tex, index) launch_params_metal.tex[index]
+#define kernel_tex_array(tex) launch_params_metal.tex
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/metal/kernel.metal
+++ b/intern/cycles/kernel/device/metal/kernel.metal
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Metal kernel entry points */
+
+// clang-format off
+
+#include "kernel/device/metal/compat.h"
+#include "kernel/device/metal/globals.h"
+#include "kernel/device/gpu/kernel.h"
+
+// clang-format on
--- a/intern/cycles/kernel/device/optix/compat.h
+++ b/intern/cycles/kernel/device/optix/compat.h
@@ -49,10 +49,11 @@ typedef unsigned long long uint64_t;
  __device__ __forceinline__  // Function calls are bad for OptiX performance, so inline everything
 #define ccl_device_inline ccl_device
 #define ccl_device_forceinline ccl_device
+#define ccl_device_inline_method ccl_device
 #define ccl_device_noinline __device__ __noinline__
 #define ccl_device_noinline_cpu ccl_device
 #define ccl_global
-#define ccl_static_constant __constant__
+#define ccl_inline_constant __constant__
 #define ccl_device_constant __constant__ __device__
 #define ccl_constant const
 #define ccl_gpu_shared __shared__
@@ -76,6 +77,7 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_block_idx_x (blockIdx.x)
 #define ccl_gpu_grid_dim_x (gridDim.x)
 #define ccl_gpu_warp_size (warpSize)
+#define ccl_gpu_thread_mask(thread_warp) uint((1ull << (thread_warp)) - 1)

 #define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
 #define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
@@ -85,7 +87,6 @@ typedef unsigned long long uint64_t;
 #define ccl_gpu_syncthreads() __syncthreads()
 #define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
 #define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down_sync(mask, var, detla)
-#define ccl_gpu_popc(x) __popc(x)

 /* GPU texture objects */

--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -21,6 +21,8 @@

 #include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */

+#include "kernel/tables.h"
+
 #include "kernel/integrator/state.h"
 #include "kernel/integrator/state_flow.h"
 #include "kernel/integrator/state_util.h"
@@ -44,7 +46,7 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
 ccl_device_forceinline int get_object_id()
 {
 #ifdef __OBJECT_MOTION__
-  /* Always get the the instance ID from the TLAS
+  /* Always get the instance ID from the TLAS
   * There might be a motion transform node between TLAS and BLAS which does not have one. */
  return optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
 #else
@@ -159,9 +161,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()

  /* Record geometric normal. */
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0));
-  const float3 tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1));
-  const float3 tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0);
+  const float3 tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1);
+  const float3 tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  /* Continue tracing (without this the trace call would return after the first hit). */
--- a/intern/cycles/kernel/film/accumulate.h
+++ b/intern/cycles/kernel/film/accumulate.h
@@ -151,7 +151,8 @@ ccl_device_forceinline ccl_global float *kernel_accum_pixel_render_buffer(
 ccl_device_inline int kernel_accum_sample(KernelGlobals kg,
                                          ConstIntegratorState state,
                                          ccl_global float *ccl_restrict render_buffer,
-                                          int sample)
+                                          int sample,
+                                          int sample_offset)
 {
  if (kernel_data.film.pass_sample_count == PASS_UNUSED) {
    return sample;
@@ -159,7 +160,9 @@ ccl_device_inline int kernel_accum_sample(KernelGlobals kg,

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);

-  return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
+  return atomic_fetch_and_add_uint32(
+             (ccl_global uint *)(buffer) + kernel_data.film.pass_sample_count, 1) +
+         sample_offset;
 }

 ccl_device void kernel_accum_adaptive_buffer(KernelGlobals kg,
@@ -550,7 +553,7 @@ ccl_device_inline void kernel_accum_background(KernelGlobals kg,
                                               const bool is_transparent_background_ray,
                                               ccl_global float *ccl_restrict render_buffer)
 {
-  float3 contribution = INTEGRATOR_STATE(state, path, throughput) * L;
+  float3 contribution = float3(INTEGRATOR_STATE(state, path, throughput)) * L;
  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(state, path, bounce) - 1);

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(kg, state, render_buffer);
--- a/intern/cycles/kernel/geom/attribute.h
+++ b/intern/cycles/kernel/geom/attribute.h
@@ -27,7 +27,12 @@ CCL_NAMESPACE_BEGIN
 * Lookup of attributes is different between OSL and SVM, as OSL is ustring
 * based while for SVM we use integer ids. */

-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd);
+/* Patch index for triangle, ~0 (i.e. (uint)-1) if not a subdivision triangle. */
+
+ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
+{
+  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
+}

 ccl_device_inline uint attribute_primitive_type(KernelGlobals kg, ccl_private const ShaderData *sd)
 {
@@ -106,9 +111,9 @@ ccl_device Transform primitive_attribute_matrix(KernelGlobals kg,
 {
  Transform tfm;

-  tfm.x = kernel_tex_fetch(__attributes_float3, desc.offset + 0);
-  tfm.y = kernel_tex_fetch(__attributes_float3, desc.offset + 1);
-  tfm.z = kernel_tex_fetch(__attributes_float3, desc.offset + 2);
+  tfm.x = kernel_tex_fetch(__attributes_float4, desc.offset + 0);
+  tfm.y = kernel_tex_fetch(__attributes_float4, desc.offset + 1);
+  tfm.z = kernel_tex_fetch(__attributes_float4, desc.offset + 2);

  return tfm;
 }
--- a/intern/cycles/kernel/geom/curve.h
+++ b/intern/cycles/kernel/geom/curve.h
@@ -126,8 +126,8 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k0));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + k1));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);

 #  ifdef __RAY_DIFFERENTIALS__
    if (dx)
@@ -149,7 +149,7 @@ ccl_device float3 curve_attribute_float3(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
    }
    else {
      return make_float3(0.0f, 0.0f, 0.0f);
@@ -168,8 +168,8 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
    int k0 = curve.first_key + PRIMITIVE_UNPACK_SEGMENT(sd->type);
    int k1 = k0 + 1;

-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + k0);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + k1);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + k0);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + k1);

 #  ifdef __RAY_DIFFERENTIALS__
    if (dx)
@@ -191,7 +191,7 @@ ccl_device float4 curve_attribute_float4(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_CURVE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_CURVE) ? desc.offset + sd->prim :
                                                                desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
    }
    else {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/geom/motion_curve.h
+++ b/intern/cycles/kernel/geom/motion_curve.h
@@ -48,8 +48,8 @@ ccl_device_inline void motion_curve_keys_for_step_linear(KernelGlobals kg,

    offset += step * numkeys;

-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
  }
 }

@@ -106,10 +106,10 @@ ccl_device_inline void motion_curve_keys_for_step(KernelGlobals kg,

    offset += step * numkeys;

-    keys[0] = kernel_tex_fetch(__attributes_float3, offset + k0);
-    keys[1] = kernel_tex_fetch(__attributes_float3, offset + k1);
-    keys[2] = kernel_tex_fetch(__attributes_float3, offset + k2);
-    keys[3] = kernel_tex_fetch(__attributes_float3, offset + k3);
+    keys[0] = kernel_tex_fetch(__attributes_float4, offset + k0);
+    keys[1] = kernel_tex_fetch(__attributes_float4, offset + k1);
+    keys[2] = kernel_tex_fetch(__attributes_float4, offset + k2);
+    keys[3] = kernel_tex_fetch(__attributes_float4, offset + k3);
  }
 }

--- a/intern/cycles/kernel/geom/motion_triangle.h
+++ b/intern/cycles/kernel/geom/motion_triangle.h
@@ -43,9 +43,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    verts[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+    verts[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+    verts[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+    verts[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
  }
  else {
    /* center step not store in this array */
@@ -54,9 +54,9 @@ ccl_device_inline void motion_triangle_verts_for_step(KernelGlobals kg,

    offset += step * numverts;

-    verts[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    verts[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    verts[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    verts[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    verts[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    verts[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
  }
 }

@@ -70,9 +70,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,
 {
  if (step == numsteps) {
    /* center step: regular vertex location */
-    normals[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
  }
  else {
    /* center step is not stored in this array */
@@ -81,9 +81,9 @@ ccl_device_inline void motion_triangle_normals_for_step(KernelGlobals kg,

    offset += step * numverts;

-    normals[0] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x));
-    normals[1] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y));
-    normals[2] = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z));
+    normals[0] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.x);
+    normals[1] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.y);
+    normals[2] = kernel_tex_fetch(__attributes_float3, offset + tri_vindex.z);
  }
 }

--- a/intern/cycles/kernel/geom/motion_triangle_intersect.h
+++ b/intern/cycles/kernel/geom/motion_triangle_intersect.h
@@ -163,19 +163,7 @@ ccl_device_inline bool motion_triangle_intersect(KernelGlobals kg,
  motion_triangle_vertices(kg, fobject, prim, time, verts);
  /* Ray-triangle intersection, unoptimized. */
  float t, u, v;
-  if (ray_triangle_intersect(P,
-                             dir,
-                             tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                             (ssef *)verts,
-#else
-                             verts[0],
-                             verts[1],
-                             verts[2],
-#endif
-                             &u,
-                             &v,
-                             &t)) {
+  if (ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
    /* Visibility flag test. we do it here under the assumption
     * that most triangles are culled by node flags.
@@ -229,19 +217,7 @@ ccl_device_inline bool motion_triangle_intersect_local(KernelGlobals kg,
  motion_triangle_vertices(kg, local_object, prim, time, verts);
  /* Ray-triangle intersection, unoptimized. */
  float t, u, v;
-  if (!ray_triangle_intersect(P,
-                              dir,
-                              tmax,
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                              (ssef *)verts,
-#  else
-                              verts[0],
-                              verts[1],
-                              verts[2],
-#  endif
-                              &u,
-                              &v,
-                              &t)) {
+  if (!ray_triangle_intersect(P, dir, tmax, verts[0], verts[1], verts[2], &u, &v, &t)) {
    return false;
  }

--- a/intern/cycles/kernel/geom/patch.h
+++ b/intern/cycles/kernel/geom/patch.h
@@ -380,7 +380,7 @@ ccl_device float3 patch_eval_float3(KernelGlobals kg,
    *dv = make_float3(0.0f, 0.0f, 0.0f);

  for (int i = 0; i < num_control; i++) {
-    float3 v = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + indices[i]));
+    float3 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);

    val += v * weights[i];
    if (du)
@@ -417,7 +417,7 @@ ccl_device float4 patch_eval_float4(KernelGlobals kg,
    *dv = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

  for (int i = 0; i < num_control; i++) {
-    float4 v = kernel_tex_fetch(__attributes_float3, offset + indices[i]);
+    float4 v = kernel_tex_fetch(__attributes_float4, offset + indices[i]);

    val += v * weights[i];
    if (du)
--- a/intern/cycles/kernel/geom/primitive.h
+++ b/intern/cycles/kernel/geom/primitive.h
@@ -284,18 +284,33 @@ ccl_device_inline float4 primitive_motion_vector(KernelGlobals kg,
    int numverts, numkeys;
    object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);

-    /* lookup attributes */
-    motion_pre = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
-    desc.offset += (sd->type & PRIMITIVE_ALL_TRIANGLE) ? numverts : numkeys;
-    motion_post = primitive_surface_attribute_float3(kg, sd, desc, NULL, NULL);
-
 #ifdef __HAIR__
-    if (is_curve_primitive && (sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
-      object_position_transform(kg, sd, &motion_pre);
-      object_position_transform(kg, sd, &motion_post);
+    if (is_curve_primitive) {
+      motion_pre = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+      desc.offset += numkeys;
+      motion_post = float4_to_float3(curve_attribute_float4(kg, sd, desc, NULL, NULL));
+
+      /* Curve */
+      if ((sd->object_flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+        object_position_transform(kg, sd, &motion_pre);
+        object_position_transform(kg, sd, &motion_post);
+      }
    }
+    else
 #endif
+        if (sd->type & PRIMITIVE_ALL_TRIANGLE) {
+      /* Triangle */
+      if (subd_triangle_patch(kg, sd) == ~0) {
+        motion_pre = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+      else {
+        motion_pre = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+        desc.offset += numverts;
+        motion_post = subd_triangle_attribute_float3(kg, sd, desc, NULL, NULL);
+      }
+    }
  }

  /* object motion. note that depending on the mesh having motion vectors, this
--- a/intern/cycles/kernel/geom/subd_triangle.h
+++ b/intern/cycles/kernel/geom/subd_triangle.h
@@ -20,13 +20,6 @@

 CCL_NAMESPACE_BEGIN

-/* Patch index for triangle, -1 if not subdivision triangle */
-
-ccl_device_inline uint subd_triangle_patch(KernelGlobals kg, ccl_private const ShaderData *sd)
-{
-  return (sd->prim != PRIM_NONE) ? kernel_tex_fetch(__tri_patch, sd->prim) : ~0;
-}
-
 /* UV coords of triangle within patch */

 ccl_device_inline void subd_triangle_patch_uv(KernelGlobals kg,
@@ -443,8 +436,8 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    if (dy)
      *dy = make_float3(0.0f, 0.0f, 0.0f);

-    return float4_to_float3(
-        kernel_tex_fetch(__attributes_float3, desc.offset + subd_triangle_patch_face(kg, patch)));
+    return kernel_tex_fetch(__attributes_float3,
+                            desc.offset + subd_triangle_patch_face(kg, patch));
  }
  else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
    float2 uv[3];
@@ -452,10 +445,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

    uint4 v = subd_triangle_patch_indices(kg, patch);

-    float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.x));
-    float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.y));
-    float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.z));
-    float3 f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + v.w));
+    float3 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
+    float3 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
+    float3 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
+    float3 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@@ -484,10 +477,10 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,

    float3 f0, f1, f2, f3;

-    f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset));
-    f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset));
-    f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset));
-    f3 = float4_to_float3(kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset));
+    f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
+    f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
+    f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
+    f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@@ -513,7 +506,7 @@ ccl_device_noinline float3 subd_triangle_attribute_float3(KernelGlobals kg,
    if (dy)
      *dy = make_float3(0.0f, 0.0f, 0.0f);

-    return float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset));
+    return kernel_tex_fetch(__attributes_float3, desc.offset);
  }
  else {
    if (dx)
@@ -590,7 +583,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    if (dy)
      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

-    return kernel_tex_fetch(__attributes_float3,
+    return kernel_tex_fetch(__attributes_float4,
                            desc.offset + subd_triangle_patch_face(kg, patch));
  }
  else if (desc.element == ATTR_ELEMENT_VERTEX || desc.element == ATTR_ELEMENT_VERTEX_MOTION) {
@@ -599,10 +592,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,

    uint4 v = subd_triangle_patch_indices(kg, patch);

-    float4 f0 = kernel_tex_fetch(__attributes_float3, desc.offset + v.x);
-    float4 f1 = kernel_tex_fetch(__attributes_float3, desc.offset + v.y);
-    float4 f2 = kernel_tex_fetch(__attributes_float3, desc.offset + v.z);
-    float4 f3 = kernel_tex_fetch(__attributes_float3, desc.offset + v.w);
+    float4 f0 = kernel_tex_fetch(__attributes_float4, desc.offset + v.x);
+    float4 f1 = kernel_tex_fetch(__attributes_float4, desc.offset + v.y);
+    float4 f2 = kernel_tex_fetch(__attributes_float4, desc.offset + v.z);
+    float4 f3 = kernel_tex_fetch(__attributes_float4, desc.offset + v.w);

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
      f1 = (f1 + f0) * 0.5f;
@@ -642,10 +635,10 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
          color_uchar4_to_float4(kernel_tex_fetch(__attributes_uchar4, corners[3] + desc.offset)));
    }
    else {
-      f0 = kernel_tex_fetch(__attributes_float3, corners[0] + desc.offset);
-      f1 = kernel_tex_fetch(__attributes_float3, corners[1] + desc.offset);
-      f2 = kernel_tex_fetch(__attributes_float3, corners[2] + desc.offset);
-      f3 = kernel_tex_fetch(__attributes_float3, corners[3] + desc.offset);
+      f0 = kernel_tex_fetch(__attributes_float4, corners[0] + desc.offset);
+      f1 = kernel_tex_fetch(__attributes_float4, corners[1] + desc.offset);
+      f2 = kernel_tex_fetch(__attributes_float4, corners[2] + desc.offset);
+      f3 = kernel_tex_fetch(__attributes_float4, corners[3] + desc.offset);
    }

    if (subd_triangle_patch_num_corners(kg, patch) != 4) {
@@ -672,7 +665,7 @@ ccl_device_noinline float4 subd_triangle_attribute_float4(KernelGlobals kg,
    if (dy)
      *dy = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
  }
  else {
    if (dx)
--- a/intern/cycles/kernel/geom/triangle.h
+++ b/intern/cycles/kernel/geom/triangle.h
@@ -29,9 +29,9 @@ ccl_device_inline float3 triangle_normal(KernelGlobals kg, ccl_private ShaderDat
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-  const float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);

  /* return normal */
  if (sd->object_flag & SD_OBJECT_NEGATIVE_SCALE_APPLIED) {
@@ -54,9 +54,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  float3 v0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  float3 v1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  float3 v2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
  /* compute point */
  float t = 1.0f - u - v;
  *P = (u * v0 + v * v1 + t * v2);
@@ -78,9 +78,9 @@ ccl_device_inline void triangle_point_normal(KernelGlobals kg,
 ccl_device_inline void triangle_vertices(KernelGlobals kg, int prim, float3 P[3])
 {
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
 }

 /* Triangle vertex locations and vertex normals */
@@ -91,12 +91,12 @@ ccl_device_inline void triangle_vertices_and_normals(KernelGlobals kg,
                                                     float3 N[3])
 {
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  P[0] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  P[1] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  P[2] = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
-  N[0] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  N[1] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  N[2] = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  P[0] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  P[1] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  P[2] = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);
+  N[0] = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  N[1] = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  N[2] = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);
 }

 /* Interpolate smooth vertex normal from vertices */
@@ -106,9 +106,9 @@ triangle_smooth_normal(KernelGlobals kg, float3 Ng, int prim, float u, float v)
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);

  float3 N = safe_normalize((1.0f - u - v) * n2 + u * n0 + v * n1);

@@ -120,9 +120,9 @@ ccl_device_inline float3 triangle_smooth_normal_unnormalized(
 {
  /* load triangle vertices */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  float3 n0 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.x));
-  float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
-  float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
+  float3 n0 = kernel_tex_fetch(__tri_vnormal, tri_vindex.x);
+  float3 n1 = kernel_tex_fetch(__tri_vnormal, tri_vindex.y);
+  float3 n2 = kernel_tex_fetch(__tri_vnormal, tri_vindex.z);

  /* ensure that the normals are in object space */
  if (sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED) {
@@ -145,9 +145,9 @@ ccl_device_inline void triangle_dPdudv(KernelGlobals kg,
 {
  /* fetch triangle vertex coordinates */
  const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
-  const float3 p0 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 0));
-  const float3 p1 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 1));
-  const float3 p2 = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex.w + 2));
+  const float3 p0 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 0);
+  const float3 p1 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 1);
+  const float3 p2 = kernel_tex_fetch(__tri_verts, tri_vindex.w + 2);

  /* compute derivatives of P w.r.t. uv */
  *dPdu = (p0 - p2);
@@ -267,15 +267,15 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z));
+      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
    }
    else {
      const int tri = desc.offset + sd->prim * 3;
-      f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 0));
-      f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 1));
-      f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, tri + 2));
+      f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
+      f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
+      f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
    }

 #ifdef __RAY_DIFFERENTIALS__
@@ -298,7 +298,7 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                               desc.offset;
-      return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset));
+      return kernel_tex_fetch(__attributes_float3, offset);
    }
    else {
      return make_float3(0.0f, 0.0f, 0.0f);
@@ -318,16 +318,16 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,

    if (desc.element & (ATTR_ELEMENT_VERTEX | ATTR_ELEMENT_VERTEX_MOTION)) {
      const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
-      f0 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.x);
-      f1 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.y);
-      f2 = kernel_tex_fetch(__attributes_float3, desc.offset + tri_vindex.z);
+      f0 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.x);
+      f1 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.y);
+      f2 = kernel_tex_fetch(__attributes_float4, desc.offset + tri_vindex.z);
    }
    else {
      const int tri = desc.offset + sd->prim * 3;
      if (desc.element == ATTR_ELEMENT_CORNER) {
-        f0 = kernel_tex_fetch(__attributes_float3, tri + 0);
-        f1 = kernel_tex_fetch(__attributes_float3, tri + 1);
-        f2 = kernel_tex_fetch(__attributes_float3, tri + 2);
+        f0 = kernel_tex_fetch(__attributes_float4, tri + 0);
+        f1 = kernel_tex_fetch(__attributes_float4, tri + 1);
+        f2 = kernel_tex_fetch(__attributes_float4, tri + 2);
      }
      else {
        f0 = color_srgb_to_linear_v4(
@@ -359,7 +359,7 @@ ccl_device float4 triangle_attribute_float4(KernelGlobals kg,
    if (desc.element & (ATTR_ELEMENT_FACE | ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
      const int offset = (desc.element == ATTR_ELEMENT_FACE) ? desc.offset + sd->prim :
                                                               desc.offset;
-      return kernel_tex_fetch(__attributes_float3, offset);
+      return kernel_tex_fetch(__attributes_float4, offset);
    }
    else {
      return make_float4(0.0f, 0.0f, 0.0f, 0.0f);
--- a/intern/cycles/kernel/geom/triangle_intersect.h
+++ b/intern/cycles/kernel/geom/triangle_intersect.h
@@ -37,27 +37,11 @@ ccl_device_inline bool triangle_intersect(KernelGlobals kg,
 {
  const int prim = kernel_tex_fetch(__prim_index, prim_addr);
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#else
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
-#endif
  float t, u, v;
-  if (ray_triangle_intersect(P,
-                             dir,
-                             tmax,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                             ssef_verts,
-#else
-                             float4_to_float3(tri_a),
-                             float4_to_float3(tri_b),
-                             float4_to_float3(tri_c),
-#endif
-                             &u,
-                             &v,
-                             &t)) {
+  if (ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
 #ifdef __VISIBILITY_FLAG__
    /* Visibility flag test. we do it here under the assumption
     * that most triangles are culled by node flags.
@@ -106,27 +90,11 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,

  const int prim = kernel_tex_fetch(__prim_index, prim_addr);
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, prim).w;
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const ssef *ssef_verts = (ssef *)&kg->__tri_verts.data[tri_vindex];
-#  else
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-#  endif
+  const float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float t, u, v;
-  if (!ray_triangle_intersect(P,
-                              dir,
-                              tmax,
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                              ssef_verts,
-#  else
-                              tri_a,
-                              tri_b,
-                              tri_c,
-#  endif
-                              &u,
-                              &v,
-                              &t)) {
+  if (!ray_triangle_intersect(P, dir, tmax, tri_a, tri_b, tri_c, &u, &v, &t)) {
    return false;
  }

@@ -178,11 +146,6 @@ ccl_device_inline bool triangle_intersect_local(KernelGlobals kg,
  isect->t = t;

  /* Record geometric normal. */
-#  if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 0)),
-               tri_b = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 1)),
-               tri_c = float4_to_float3(kernel_tex_fetch(__tri_verts, tri_vindex + 2));
-#  endif
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

  return false;
@@ -223,9 +186,9 @@ ccl_device_inline float3 triangle_refine(KernelGlobals kg,
  P = P + D * t;

  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
@@ -280,9 +243,9 @@ ccl_device_inline float3 triangle_refine_local(KernelGlobals kg,

 #  ifdef __INTERSECTION_REFINE__
  const uint tri_vindex = kernel_tex_fetch(__tri_vindex, isect_prim).w;
-  const float4 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
-               tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
-               tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
+  const packed_float3 tri_a = kernel_tex_fetch(__tri_verts, tri_vindex + 0),
+                      tri_b = kernel_tex_fetch(__tri_verts, tri_vindex + 1),
+                      tri_c = kernel_tex_fetch(__tri_verts, tri_vindex + 2);
  float3 edge1 = make_float3(tri_a.x - tri_c.x, tri_a.y - tri_c.y, tri_a.z - tri_c.z);
  float3 edge2 = make_float3(tri_b.x - tri_c.x, tri_b.y - tri_c.y, tri_b.z - tri_c.z);
  float3 tvec = make_float3(P.x - tri_c.x, P.y - tri_c.y, P.z - tri_c.z);
--- a/intern/cycles/kernel/geom/volume.h
+++ b/intern/cycles/kernel/geom/volume.h
@@ -75,7 +75,7 @@ ccl_device float4 volume_attribute_float4(KernelGlobals kg,
                                          const AttributeDescriptor desc)
 {
  if (desc.element & (ATTR_ELEMENT_OBJECT | ATTR_ELEMENT_MESH)) {
-    return kernel_tex_fetch(__attributes_float3, desc.offset);
+    return kernel_tex_fetch(__attributes_float4, desc.offset);
  }
  else if (desc.element == ATTR_ELEMENT_VOXEL) {
    /* todo: optimize this so we don't have to transform both here and in
--- a/intern/cycles/kernel/integrator/init_from_bake.h
+++ b/intern/cycles/kernel/integrator/init_from_bake.h
@@ -65,7 +65,8 @@ ccl_device bool integrator_init_from_bake(KernelGlobals kg,
  }

  /* Always count the sample, even if the camera sample will reject the ray. */
-  const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+  const int sample = kernel_accum_sample(
+      kg, state, render_buffer, scheduled_sample, tile->sample_offset);

  /* Setup render buffers. */
  const int index = INTEGRATOR_STATE(state, path, render_pixel_index);
--- a/intern/cycles/kernel/integrator/init_from_camera.h
+++ b/intern/cycles/kernel/integrator/init_from_camera.h
@@ -89,7 +89,8 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
   * This logic allows to both count actual number of samples per pixel, and to add samples to this
   * pixel after it was converged and samples were added somewhere else (in which case the
   * `scheduled_sample` will be different from actual number of samples in this pixel). */
-  const int sample = kernel_accum_sample(kg, state, render_buffer, scheduled_sample);
+  const int sample = kernel_accum_sample(
+      kg, state, render_buffer, scheduled_sample, tile->sample_offset);

  /* Initialize random number seed for path. */
  const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
--- a/intern/cycles/kernel/integrator/shade_background.h
+++ b/intern/cycles/kernel/integrator/shade_background.h
@@ -20,7 +20,6 @@
 #include "kernel/integrator/shader_eval.h"
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"
-#include "kernel/sample/mis.h"

 CCL_NAMESPACE_BEGIN

@@ -81,8 +80,7 @@ ccl_device float3 integrator_eval_background_shader(KernelGlobals kg,
    /* multiple importance sampling, get background light pdf for ray
     * direction, and compute weight with respect to BSDF pdf */
    const float pdf = background_light_pdf(kg, ray_P - ray_D * mis_ray_t, ray_D);
-    const float mis_weight = power_heuristic(mis_ray_pdf, pdf);
-
+    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, pdf);
    L *= mis_weight;
  }
 #  endif
@@ -169,7 +167,7 @@ ccl_device_inline void integrate_distant_lights(KernelGlobals kg,
        /* multiple importance sampling, get regular light pdf,
         * and compute weight with respect to BSDF pdf */
        const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-        const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+        const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
        light_eval *= mis_weight;
      }

--- a/intern/cycles/kernel/integrator/shade_light.h
+++ b/intern/cycles/kernel/integrator/shade_light.h
@@ -84,7 +84,7 @@ ccl_device_inline void integrate_light(KernelGlobals kg,
    /* multiple importance sampling, get regular light pdf,
     * and compute weight with respect to BSDF pdf */
    const float mis_ray_pdf = INTEGRATOR_STATE(state, path, mis_ray_pdf);
-    const float mis_weight = power_heuristic(mis_ray_pdf, ls.pdf);
+    const float mis_weight = light_sample_mis_weight_forward(kg, mis_ray_pdf, ls.pdf);
    light_eval *= mis_weight;
  }

--- a/intern/cycles/kernel/integrator/shade_shadow.h
+++ b/intern/cycles/kernel/integrator/shade_shadow.h
@@ -95,8 +95,8 @@ ccl_device_inline void integrate_transparent_volume_shadow(KernelGlobals kg,

  shader_setup_from_volume(kg, shadow_sd, &ray);

-  const float step_size = volume_stack_step_size(
-      kg, [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i));
+  const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);

  volume_shadow_heterogeneous(kg, state, &ray, shadow_sd, throughput, step_size);
 }
--- a/intern/cycles/kernel/integrator/shade_surface.h
+++ b/intern/cycles/kernel/integrator/shade_surface.h
@@ -27,8 +27,6 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"

-#include "kernel/sample/mis.h"
-
 CCL_NAMESPACE_BEGIN

 ccl_device_forceinline void integrate_surface_shader_setup(KernelGlobals kg,
@@ -95,8 +93,7 @@ ccl_device_forceinline void integrate_surface_emission(KernelGlobals kg,
    /* Multiple importance sampling, get triangle light pdf,
     * and compute weight with respect to BSDF pdf. */
    float pdf = triangle_light_pdf(kg, sd, t);
-    float mis_weight = power_heuristic(bsdf_pdf, pdf);
-
+    float mis_weight = light_sample_mis_weight_forward(kg, bsdf_pdf, pdf);
    L *= mis_weight;
  }

@@ -155,7 +152,7 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
  bsdf_eval_mul3(&bsdf_eval, light_eval / ls.pdf);

  if (ls.shader & SHADER_USE_MIS) {
-    const float mis_weight = power_heuristic(ls.pdf, bsdf_pdf);
+    const float mis_weight = light_sample_mis_weight_nee(kg, ls.pdf, bsdf_pdf);
    bsdf_eval_mul(&bsdf_eval, mis_weight);
  }

@@ -195,12 +192,13 @@ ccl_device_forceinline void integrate_surface_direct_light(KernelGlobals kg,
  const float3 throughput = INTEGRATOR_STATE(state, path, throughput) * bsdf_eval_sum(&bsdf_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 pass_diffuse_weight = (bounce == 0) ?
-                                           bsdf_eval_pass_diffuse_weight(&bsdf_eval) :
-                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
-    const float3 pass_glossy_weight = (bounce == 0) ?
-                                          bsdf_eval_pass_glossy_weight(&bsdf_eval) :
-                                          INTEGRATOR_STATE(state, path, pass_glossy_weight);
+    const packed_float3 pass_diffuse_weight =
+        (bounce == 0) ? packed_float3(bsdf_eval_pass_diffuse_weight(&bsdf_eval)) :
+                        INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const packed_float3 pass_glossy_weight = (bounce == 0) ?
+                                                 packed_float3(
+                                                     bsdf_eval_pass_glossy_weight(&bsdf_eval)) :
+                                                 INTEGRATOR_STATE(state, path, pass_glossy_weight);
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = pass_glossy_weight;
  }
--- a/intern/cycles/kernel/integrator/shade_volume.h
+++ b/intern/cycles/kernel/integrator/shade_volume.h
@@ -27,8 +27,6 @@
 #include "kernel/light/light.h"
 #include "kernel/light/sample.h"

-#include "kernel/sample/mis.h"
-
 CCL_NAMESPACE_BEGIN

 #ifdef __VOLUME__
@@ -78,9 +76,8 @@ ccl_device_inline bool shadow_volume_shader_sample(KernelGlobals kg,
                                                   ccl_private ShaderData *ccl_restrict sd,
                                                   ccl_private float3 *ccl_restrict extinction)
 {
-  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, [=](const int i) {
-    return integrator_state_read_shadow_volume_stack(state, i);
-  });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+  shader_eval_volume<true>(kg, state, sd, PATH_RAY_SHADOW, volume_read_lambda_pass);

  if (!(sd->flag & SD_EXTINCTION)) {
    return false;
@@ -98,9 +95,8 @@ ccl_device_inline bool volume_shader_sample(KernelGlobals kg,
                                            ccl_private VolumeShaderCoefficients *coeff)
 {
  const uint32_t path_flag = INTEGRATOR_STATE(state, path, flag);
-  shader_eval_volume<false>(kg, state, sd, path_flag, [=](const int i) {
-    return integrator_state_read_volume_stack(state, i);
-  });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  shader_eval_volume<false>(kg, state, sd, path_flag, volume_read_lambda_pass);

  if (!(sd->flag & (SD_EXTINCTION | SD_SCATTER | SD_EMISSION))) {
    return false;
@@ -772,7 +768,7 @@ ccl_device_forceinline void integrate_volume_direct_light(
  const float phase_pdf = shader_volume_phase_eval(kg, sd, phases, ls->D, &phase_eval);

  if (ls->shader & SHADER_USE_MIS) {
-    float mis_weight = power_heuristic(ls->pdf, phase_pdf);
+    float mis_weight = light_sample_mis_weight_nee(kg, ls->pdf, phase_pdf);
    bsdf_eval_mul(&phase_eval, mis_weight);
  }

@@ -805,9 +801,10 @@ ccl_device_forceinline void integrate_volume_direct_light(
  const float3 throughput_phase = throughput * bsdf_eval_sum(&phase_eval);

  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
-    const float3 pass_diffuse_weight = (bounce == 0) ?
-                                           one_float3() :
-                                           INTEGRATOR_STATE(state, path, pass_diffuse_weight);
+    const packed_float3 pass_diffuse_weight = (bounce == 0) ?
+                                                  packed_float3(one_float3()) :
+                                                  INTEGRATOR_STATE(
+                                                      state, path, pass_diffuse_weight);
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_diffuse_weight) = pass_diffuse_weight;
    INTEGRATOR_STATE_WRITE(shadow_state, shadow_path, pass_glossy_weight) = zero_float3();
  }
@@ -932,8 +929,8 @@ ccl_device VolumeIntegrateEvent volume_integrate(KernelGlobals kg,
                                                VOLUME_SAMPLE_DISTANCE;

  /* Step through volume. */
-  const float step_size = volume_stack_step_size(
-      kg, [=](const int i) { return integrator_state_read_volume_stack(state, i); });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  const float step_size = volume_stack_step_size(kg, volume_read_lambda_pass);

  /* TODO: expensive to zero closures? */
  VolumeIntegrateResult result = {};
--- a/intern/cycles/kernel/integrator/shadow_state_template.h
+++ b/intern/cycles/kernel/integrator/shadow_state_template.h
@@ -40,15 +40,15 @@ KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, queued_kernel, KERNEL_FEATURE_PATH_T
 /* enum PathRayFlag */
 KERNEL_STRUCT_MEMBER(shadow_path, uint32_t, flag, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput for shadow pass. */
 KERNEL_STRUCT_MEMBER(shadow_path,
-                     float3,
+                     packed_float3,
                     unshadowed_throughput,
                     KERNEL_FEATURE_SHADOW_PASS | KERNEL_FEATURE_AO_ADDITIVE)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(shadow_path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(shadow_path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Number of intersections found by ray-tracing. */
 KERNEL_STRUCT_MEMBER(shadow_path, uint16_t, num_hits, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_END(shadow_path)
@@ -56,8 +56,8 @@ KERNEL_STRUCT_END(shadow_path)
 /********************************** Shadow Ray *******************************/

 KERNEL_STRUCT_BEGIN(shadow_ray)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(shadow_ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(shadow_ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(shadow_ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -59,12 +59,12 @@ KERNEL_STRUCT_MEMBER(path, float, min_ray_pdf, KERNEL_FEATURE_PATH_TRACING)
 /* Continuation probability for path termination. */
 KERNEL_STRUCT_MEMBER(path, float, continuation_probability, KERNEL_FEATURE_PATH_TRACING)
 /* Throughput. */
-KERNEL_STRUCT_MEMBER(path, float3, throughput, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, throughput, KERNEL_FEATURE_PATH_TRACING)
 /* Ratio of throughput to distinguish diffuse / glossy / transmission render passes. */
-KERNEL_STRUCT_MEMBER(path, float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
-KERNEL_STRUCT_MEMBER(path, float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_diffuse_weight, KERNEL_FEATURE_LIGHT_PASSES)
+KERNEL_STRUCT_MEMBER(path, packed_float3, pass_glossy_weight, KERNEL_FEATURE_LIGHT_PASSES)
 /* Denoising. */
-KERNEL_STRUCT_MEMBER(path, float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
+KERNEL_STRUCT_MEMBER(path, packed_float3, denoising_feature_throughput, KERNEL_FEATURE_DENOISING)
 /* Shader sorting. */
 /* TODO: compress as uint16? or leave out entirely and recompute key in sorting code? */
 KERNEL_STRUCT_MEMBER(path, uint32_t, shader_sort_key, KERNEL_FEATURE_PATH_TRACING)
@@ -73,8 +73,8 @@ KERNEL_STRUCT_END(path)
 /************************************** Ray ***********************************/

 KERNEL_STRUCT_BEGIN(ray)
-KERNEL_STRUCT_MEMBER(ray, float3, P, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_MEMBER(ray, float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, t, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
@@ -96,10 +96,10 @@ KERNEL_STRUCT_END(isect)
 /*************** Subsurface closure state for subsurface kernel ***************/

 KERNEL_STRUCT_BEGIN(subsurface)
-KERNEL_STRUCT_MEMBER(subsurface, float3, albedo, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, radius, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, albedo, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, radius, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_MEMBER(subsurface, float, anisotropy, KERNEL_FEATURE_SUBSURFACE)
-KERNEL_STRUCT_MEMBER(subsurface, float3, Ng, KERNEL_FEATURE_SUBSURFACE)
+KERNEL_STRUCT_MEMBER(subsurface, packed_float3, Ng, KERNEL_FEATURE_SUBSURFACE)
 KERNEL_STRUCT_END(subsurface)

 /********************************** Volume Stack ******************************/
--- a/intern/cycles/kernel/integrator/volume_stack.h
+++ b/intern/cycles/kernel/integrator/volume_stack.h
@@ -18,6 +18,14 @@

 CCL_NAMESPACE_BEGIN

+/* Volumetric read/write lambdas - defaults, used unless VOLUME_READ_LAMBDA is pre-defined. */
+#ifndef VOLUME_READ_LAMBDA /* This single guard covers both macros below. */
+#  define VOLUME_READ_LAMBDA(function_call) \
+    auto volume_read_lambda_pass = [=](const int i) { return function_call; };
+#  define VOLUME_WRITE_LAMBDA(function_call) \
+    auto volume_write_lambda_pass = [=](const int i, VolumeStack entry) { function_call; };
+#endif /* VOLUME_READ_LAMBDA */
+
 /* Volume Stack
 *
 * This is an array of object/shared ID's that the current segment of the path
@@ -88,26 +96,18 @@ ccl_device void volume_stack_enter_exit(KernelGlobals kg,
                                        IntegratorState state,
                                        ccl_private const ShaderData *sd)
 {
-  volume_stack_enter_exit(
-      kg,
-      sd,
-      [=](const int i) { return integrator_state_read_volume_stack(state, i); },
-      [=](const int i, const VolumeStack entry) {
-        integrator_state_write_volume_stack(state, i, entry);
-      });
+  VOLUME_READ_LAMBDA(integrator_state_read_volume_stack(state, i))
+  VOLUME_WRITE_LAMBDA(integrator_state_write_volume_stack(state, i, entry))
+  volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass);
 }

 ccl_device void shadow_volume_stack_enter_exit(KernelGlobals kg,
                                               IntegratorShadowState state,
                                               ccl_private const ShaderData *sd)
 {
-  volume_stack_enter_exit(
-      kg,
-      sd,
-      [=](const int i) { return integrator_state_read_shadow_volume_stack(state, i); },
-      [=](const int i, const VolumeStack entry) {
-        integrator_state_write_shadow_volume_stack(state, i, entry);
-      });
+  VOLUME_READ_LAMBDA(integrator_state_read_shadow_volume_stack(state, i))
+  VOLUME_WRITE_LAMBDA(integrator_state_write_shadow_volume_stack(state, i, entry))
+  volume_stack_enter_exit(kg, sd, volume_read_lambda_pass, volume_write_lambda_pass);
 }

 /* Clean stack after the last bounce.
--- a/intern/cycles/kernel/light/light.h
+++ b/intern/cycles/kernel/light/light.h
@@ -676,19 +676,7 @@ ccl_device_forceinline void triangle_light_sample(KernelGlobals kg,
    ls->D = z * B + safe_sqrtf(1.0f - z * z) * safe_normalize(C_ - dot(C_, B) * B);

    /* calculate intersection with the planar triangle */
-    if (!ray_triangle_intersect(P,
-                                ls->D,
-                                FLT_MAX,
-#if defined(__KERNEL_SSE2__) && defined(__KERNEL_SSE__)
-                                (ssef *)V,
-#else
-                                V[0],
-                                V[1],
-                                V[2],
-#endif
-                                &ls->u,
-                                &ls->v,
-                                &ls->t)) {
+    if (!ray_triangle_intersect(P, ls->D, FLT_MAX, V[0], V[1], V[2], &ls->u, &ls->v, &ls->t)) {
      ls->pdf = 0.0f;
      return;
    }
--- a/intern/cycles/kernel/light/sample.h
+++ b/intern/cycles/kernel/light/sample.h
@@ -22,6 +22,7 @@
 #include "kernel/light/light.h"

 #include "kernel/sample/mapping.h"
+#include "kernel/sample/mis.h"

 CCL_NAMESPACE_BEGIN

@@ -268,4 +269,36 @@ ccl_device_inline void light_sample_to_volume_shadow_ray(
  shadow_ray_setup(sd, ls, P, ray);
 }

+ccl_device_inline float light_sample_mis_weight_forward(KernelGlobals kg,
+                                                        const float forward_pdf,
+                                                        const float nee_pdf)
+{
+#ifdef WITH_CYCLES_DEBUG
+  if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) {
+    return 1.0f; /* FORWARD sampling type: fixed weight of one. */
+  }
+  else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) {
+    return 0.0f; /* NEE sampling type: fixed weight of zero. */
+  }
+  else /* Pairs with the return after #endif when WITH_CYCLES_DEBUG is defined. */
+#endif /* WITH_CYCLES_DEBUG */
+    return power_heuristic(forward_pdf, nee_pdf);
+}
+
+ccl_device_inline float light_sample_mis_weight_nee(KernelGlobals kg,
+                                                    const float nee_pdf,
+                                                    const float forward_pdf)
+{
+#ifdef WITH_CYCLES_DEBUG
+  if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_FORWARD) {
+    return 0.0f; /* FORWARD sampling type: fixed weight of zero. */
+  }
+  else if (kernel_data.integrator.direct_light_sampling_type == DIRECT_LIGHT_SAMPLING_NEE) {
+    return 1.0f; /* NEE sampling type: fixed weight of one. */
+  }
+  else /* Pairs with the return after #endif when WITH_CYCLES_DEBUG is defined. */
+#endif /* WITH_CYCLES_DEBUG */
+    return power_heuristic(nee_pdf, forward_pdf);
+}
+
 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/osl/CMakeLists.txt
+++ b/intern/cycles/kernel/osl/CMakeLists.txt
@@ -55,7 +55,7 @@ if(APPLE)
  # Disable allocation warning on macOS prior to 10.14: the OSLRenderServices
  # contains member which is 64 bytes aligned (cache inside of OIIO's
  # unordered_map_concurrent). This is not something what the SDK supportsm, but
-  # since we take care of allocations ourselves is is OK to ignore the
+  # since we take care of allocations ourselves it is OK to ignore the
  # diagnostic message.
  string(APPEND CMAKE_CXX_FLAGS " -faligned-allocation")
 endif()
--- a/Show More
+++ b/Show More