WIP: Interleaved slices for better work distribution with a multi-GPU setup #110348

Draft
William Leeson wants to merge 82 commits from leesonw/blender-cluster:work_sets_similar into main

22 changed files with 350 additions and 99 deletions

View File

@@ -978,6 +978,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=8, max=8192,
)
interleaved_slices: BoolProperty(
Review

Outside of development purposes I don't think this should be an option.

If we plan to keep it for future development/investigation, it is better to move it to the debug panel.
Otherwise perhaps just remove the option.

name="Interleaved Slices",
default=True,
description="If true, work is distributed as many small interleaved work slices so as to spread the work more evenly; otherwise a single big slice per device is used"
)
# Various fine-tuning debug flags
def _devices_update_callback(self, context):
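For context on what this toggle changes on the Cycles side (an illustrative sketch only, not code from this patch): with interleaved slices every device owns a small band of rows that repeats every slice_stride rows, instead of one large contiguous block. A hypothetical helper mapping an image row to its device could look like this:

/* Illustrative only: which device renders image row y when slices are interleaved.
 * Assumes slice_sizes[] and slice_stride were derived from the per-device weights. */
int device_for_row(const int y, const int *slice_sizes, const int num_devices, const int slice_stride)
{
  int offset = y % slice_stride; /* Position inside the repeating group of slices. */
  for (int i = 0; i < num_devices; i++) {
    if (offset < slice_sizes[i]) {
      return i;
    }
    offset -= slice_sizes[i];
  }
  return num_devices - 1; /* Unreachable if the sizes sum to slice_stride. */
}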

View File

@@ -866,6 +866,22 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
col.prop(rd, "preview_pixel_size", text="Pixel Size")
class CYCLES_RENDER_PT_performance_multiple_device(CyclesButtonsPanel, Panel):
bl_label = "Multiple Device"
bl_parent_id = "CYCLES_RENDER_PT_performance"
def draw(self, context):
layout = self.layout
layout.use_property_split = True
layout.use_property_decorate = False
scene = context.scene
cscene = scene.cycles
col = layout.column()
col.prop(cscene, "interleaved_slices")
class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
bl_label = "Filter"
bl_options = {'DEFAULT_CLOSED'}
@@ -2526,6 +2542,7 @@ classes = (
CYCLES_RENDER_PT_performance_acceleration_structure,
CYCLES_RENDER_PT_performance_final_render,
CYCLES_RENDER_PT_performance_viewport,
CYCLES_RENDER_PT_performance_multiple_device,
CYCLES_RENDER_PT_passes,
CYCLES_RENDER_PT_passes_data,
CYCLES_RENDER_PT_passes_light,

View File

@@ -851,6 +851,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
params.interleaved_slices = get_boolean(cscene, "interleaved_slices");
if (background && !b_engine.is_preview()) {
/* Viewport and preview renders do not require temp directory and do request session
* parameters more often than the background render.

View File

@@ -13,6 +13,7 @@
# include "device/metal/bvh.h"
# include "device/metal/util.h"
# include "device/device.h"
CCL_NAMESPACE_BEGIN

View File

@@ -81,8 +81,12 @@ bool DenoiserGPU::denoise_buffer(const BufferParams &buffer_params,
if (local_buffer_used) {
local_render_buffers.copy_from_device();
render_buffers_host_copy_denoised(
render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
render_buffers_host_copy_denoised(render_buffers,
buffer_params,
0,
buffer_params.height,
&local_render_buffers,
local_render_buffers.params);
render_buffers->copy_to_device();
}

View File

@@ -88,21 +88,30 @@ static void pad_pixels(const BufferParams &buffer_params,
return;
}
const size_t size = static_cast<size_t>(buffer_params.width) * buffer_params.height;
const int width = buffer_params.width;
const int slice_stride = buffer_params.slice_stride;
const int slice_height = buffer_params.slice_height;
const int total_height = buffer_params.window_height;
if (destination.pixels) {
const size_t pixel_stride = destination.pixel_stride ? destination.pixel_stride :
destination.num_components;
float *pixel = destination.pixels + pixel_stride * destination.offset;
for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
if (dest_num_components >= 3 && src_num_components == 1) {
pixel[1] = pixel[0];
pixel[2] = pixel[0];
}
if (dest_num_components >= 4) {
pixel[3] = 1.0f;
for (int slice_y = 0; slice_y < total_height; slice_y += slice_height) {
float *dst = pixel;
const int height = std::min(slice_height, total_height - slice_y);
const int size = width * height;
for (size_t i = 0; i < size; i++, dst += dest_num_components) {
if (dest_num_components >= 3 && src_num_components == 1) {
dst[1] = dst[0];
dst[2] = dst[0];
}
if (dest_num_components >= 4) {
dst[3] = 1.0f;
}
}
pixel += slice_stride * width * dest_num_components;
}
}
@@ -110,14 +119,20 @@ static void pad_pixels(const BufferParams &buffer_params,
const half one = float_to_half_display(1.0f);
half4 *pixel = destination.pixels_half_rgba + destination.offset;
for (size_t i = 0; i < size; i++, pixel++) {
if (dest_num_components >= 3 && src_num_components == 1) {
pixel[0].y = pixel[0].x;
pixel[0].z = pixel[0].x;
}
if (dest_num_components >= 4) {
pixel[0].w = one;
for (int slice_y = 0; slice_y < total_height; slice_y += slice_height) {
half4 *dst = pixel;
const int height = std::min(slice_height, total_height - slice_y);
const int size = width * height;
for (size_t i = 0; i < size; i++, dst++) {
if (dest_num_components >= 3 && src_num_components == 1) {
dst[0].y = dst[0].x;
dst[0].z = dst[0].x;
}
if (dest_num_components >= 4) {
dst[0].w = one;
}
}
pixel += slice_stride * width;
}
}
}
@@ -298,7 +313,6 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
const BufferParams &buffer_params = render_buffers->params;
float *buffer_data = render_buffers->buffer.data();
const int size = buffer_params.width * buffer_params.height;
const int out_stride = buffer_params.pass_stride;
const int in_stride = source.num_components;
@@ -306,11 +320,18 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
float *out = buffer_data + pass_access_info_.offset;
const float *in = source.pixels + source.offset * in_stride;
for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
memcpy(out, in, sizeof(float) * num_components_to_copy);
const int slice_height = buffer_params.slice_height;
const int total_height = buffer_params.height;
const int slice_stride = buffer_params.slice_stride;
for (int y = 0; y < total_height; y += slice_height) {
const int height = std::min(slice_height, total_height - y);
const float *src = in;
const int size = height * buffer_params.width;
for (int i = 0; i < size; i++, out += out_stride, src += in_stride) {
memcpy(out, src, sizeof(float) * num_components_to_copy);
}
in += slice_stride * buffer_params.width * in_stride;
}
return true;
}

View File

@@ -45,11 +45,38 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
destination.num_components;
parallel_for(0, buffer_params.window_height, [&](int64_t y) {
/* Calculate how many full plus partial slices there are */
Review

Full stops in the comments.

int slice_height;
int slice_stride;
int slices;
if(buffer_params.slice_height > 0) {
/* Copy over each slice */
slices = buffer_params.window_height/buffer_params.slice_height;
slices += (slices*buffer_params.slice_height < buffer_params.window_height) ? 1 : 0;
slice_height = buffer_params.slice_height;
slice_stride = buffer_params.slice_stride;
}
else {
/* Assign each row to a slice */
slices = buffer_params.window_height;
slice_height = 1;
slice_stride = 1;
}
/* Copy over each slice to the destination */
parallel_for(0, slices, [&](int slice) {
//for(int slice = 0;slice < slices;++slice) {
Review

Remove the dead code. Also, it seems that clang-format is not properly configured in your setup.

int y = slice*slice_height;
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
(y * buffer_params.width + destination.offset) * pixel_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
int height = std::min(slice_height, buffer_params.window_height - y);
int pixel_y = slice * slice_stride;
float *pixels = destination.pixels +
(pixel_y * buffer_params.width + destination.offset) * pixel_stride;
for (int row = 0; row < height;
row++, buffer += buffer_row_stride, pixels += buffer_params.width * pixel_stride)
{
func(kfilm_convert, buffer, pixels, buffer_params.window_width, pass_stride, pixel_stride);
}
});
}
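The slice count computed above is a ceiling division; an equivalent single expression (illustrative only, not part of the patch) is:

/* Number of full plus partial slices, e.g. window_height = 10, slice_height = 4 gives 3. */
const int slices = (buffer_params.window_height + buffer_params.slice_height - 1) /
                   buffer_params.slice_height;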
@@ -70,10 +97,35 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const int destination_stride = destination.stride != 0 ? destination.stride :
buffer_params.width;
parallel_for(0, buffer_params.window_height, [&](int64_t y) {
/* Calculate how many full plus partial slices there are */
int slice_height;
int slice_stride;
int slices;
if(buffer_params.slice_height > 0) {
/* Copy over each slice */
slices = buffer_params.window_height/buffer_params.slice_height;
slices += (slices*buffer_params.slice_height < buffer_params.window_height) ? 1 : 0;
slice_height = buffer_params.slice_height;
slice_stride = buffer_params.slice_stride;
}
else {
/* Assign each row to a slice */
slices = buffer_params.window_height;
slice_height = 1;
slice_stride = 1;
}
parallel_for(0, slices, [&](int slice) {
//for (int slice = 0; slice < slices; ++slice) {
int y = slice*slice_height;
const float *buffer = window_data + y * buffer_row_stride;
half4 *pixel = dst_start + y * destination_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
int height = std::min(slice_height, buffer_params.window_height - y);
int pixel_y = slice * buffer_params.slice_stride;
half4 *pixels = dst_start + pixel_y * destination_stride;
for (int row = 0; row < height;
row++, buffer += buffer_row_stride, pixels += destination_stride) {
func(kfilm_convert, buffer, pixels, buffer_params.window_width, pass_stride);
}
});
}

View File

@@ -50,7 +50,6 @@ PathTrace::PathTrace(Device *device,
path_trace_works_.emplace_back(std::move(work));
}
});
work_balance_infos_.resize(path_trace_works_.size());
work_balance_do_initial(work_balance_infos_);
@@ -246,42 +245,92 @@ static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>>
const vector<WorkBalanceInfo> &work_balance_infos,
const BufferParams &buffer_params,
const int overscan,
const bool interleaved_slices,
const Callback &callback)
{
const int num_works = path_trace_works.size();
const int window_height = buffer_params.window_height;
/* Find the largest and smallest weights */
int largest_weight = 0;
int smallest_weight = 0;
for (int i = 0; i < num_works; i++) {
double weight = work_balance_infos[i].weight;
if (weight > work_balance_infos[largest_weight].weight) {
largest_weight = i;
}
if (weight < work_balance_infos[smallest_weight].weight) {
smallest_weight = i;
}
}
int biggest_slice = work_balance_infos[largest_weight].weight /
work_balance_infos[smallest_weight].weight;
int slice_stride = 0;
int slice_sizes[num_works];
int remaining_rows = window_height;
double allocatable_slices;
double fixed_slices;
bool use_interleaved = (biggest_slice < (window_height / num_works)) && interleaved_slices;
if (use_interleaved) {
/* Assign the slices entirely based on the weight */
allocatable_slices = 1.0 / work_balance_infos[smallest_weight].weight;
fixed_slices = 0.5; /* Round up so that smallest always gets 1 */
}
else {
/* Instead of using interleaved slices create n bigger consecutive slices */
/* Aim to achieve at least 1 slice per device otherwise use consecutive slices */
allocatable_slices = remaining_rows - num_works; /* each slice must have at least 1 row */
fixed_slices = 1; /* Make sure all slices get at least 1 */
}
/* Assign a size to each slice based on its weight */
for (int i = 0; i < num_works; i++) {
int slice_size = std::floor(work_balance_infos[i].weight * allocatable_slices + fixed_slices);
slice_sizes[i] = slice_size;
slice_stride += slice_size;
}
/* If there are any remaining scanlines due to truncation add them to the device with the
* highest weight */
int leftover_scanlines = allocatable_slices - slice_stride;
if (leftover_scanlines > 0) {
slice_sizes[largest_weight] += leftover_scanlines;
slice_stride++;
} else if(leftover_scanlines < 0) {
VLOG_WARNING << "#######Used to many scanlines";
Review

Not sure about the details, but it seems that VLOG_WARNING << "Used to many scanlines" would be a much better fit here.

If this is something that must get developer attention, DCHECK_GE(leftover_scanlines, 0) can be used above.

}
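A minimal sketch of what the review above suggests (using the DCHECK_GE and VLOG_WARNING macros it mentions; illustrative only, not part of the patch):

/* Developer-facing assert plus a terse log, as suggested in the review. */
DCHECK_GE(leftover_scanlines, 0);
if (leftover_scanlines < 0) {
  VLOG_WARNING << "Used too many scanlines";
}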
VLOG_INFO << "===================SLICE allocatable:" << allocatable_slices << " fixed:"<< fixed_slices << "================";
Review

Not sure I'd keep such explicit separation. Surely, it is important for this PR, but outside of this PR it feels it'll draw too much attention.

int slices = window_height / slice_stride;
int current_y = 0;
for (int i = 0; i < num_works; ++i) {
const double weight = work_balance_infos[i].weight;
const int slice_window_full_y = buffer_params.full_y + buffer_params.window_y + current_y;
const int slice_window_height = max(lround(window_height * weight), 1);
/* Disallow negative values to deal with situations when there are more compute devices than
* scan-lines. */
const int remaining_window_height = max(0, window_height - current_y);
const int slice_left_at_end = std::max(0, window_height - slices * slice_stride - current_y);
const int slice_window_height =
slices * slice_sizes[i] +
std::min(slice_sizes[i], slice_left_at_end);
BufferParams slice_params = buffer_params;
slice_params.full_y = max(slice_window_full_y - overscan, buffer_params.full_y);
slice_params.window_y = slice_window_full_y - slice_params.full_y;
if (i < num_works - 1) {
slice_params.window_height = min(slice_window_height, remaining_window_height);
}
else {
slice_params.window_height = remaining_window_height;
}
slice_params.window_height = slice_window_height;
slice_params.height = slice_params.window_y + slice_params.window_height + overscan;
slice_params.height = min(slice_params.height,
buffer_params.height + buffer_params.full_y - slice_params.full_y);
slice_params.slice_height = slice_sizes[i];
slice_params.slice_stride = slice_stride;
slice_params.update_offset_stride();
callback(path_trace_works[i].get(), slice_params);
current_y += slice_params.window_height;
current_y += slice_sizes[i];
VLOG_INFO << "(" << i << ") Slice size:" << slice_sizes[i] << " weight:" << work_balance_infos[i].weight;
}
}
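To make the interleaved branch of the allocation above concrete, here is a self-contained illustrative sketch with made-up weights (not code from this patch):

#include <cmath>

/* Illustrative only: the interleaved branch with two devices of weights 0.75 and 0.25. */
void example_slice_allocation()
{
  const double weights[2] = {0.75, 0.25};
  const double allocatable_slices = 1.0 / 0.25; /* 1 / smallest weight = 4. */
  const double fixed_slices = 0.5;              /* Round so the smallest device still gets 1 row. */
  int slice_sizes[2];
  int slice_stride = 0;
  for (int i = 0; i < 2; i++) {
    slice_sizes[i] = static_cast<int>(std::floor(weights[i] * allocatable_slices + fixed_slices));
    slice_stride += slice_sizes[i];
  }
  /* slice_sizes = {3, 1}, slice_stride = 4: each group of 4 consecutive rows gives 3 rows to
   * the fast device and 1 row to the slow one, repeated down the image. */
  (void)slice_stride;
}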
@@ -292,6 +341,7 @@ void PathTrace::update_allocated_work_buffer_params()
work_balance_infos_,
big_tile_params_,
overscan,
interleaved_slices,
[](PathTraceWork *path_trace_work, const BufferParams &params) {
RenderBuffers *buffers = path_trace_work->get_render_buffers();
buffers->reset(params);
@@ -334,6 +384,7 @@ void PathTrace::update_effective_work_buffer_params(const RenderWork &render_wor
work_balance_infos_,
scaled_big_tile_params,
overscan,
interleaved_slices,
[&](PathTraceWork *path_trace_work, const BufferParams params) {
path_trace_work->set_effective_buffer_params(
scaled_full_params, scaled_big_tile_params, params);
@@ -411,6 +462,10 @@ void PathTrace::path_trace(RenderWork &render_work)
<< " seconds per sample), occupancy: " << statistics.occupancy;
});
const double work_time = time_dt() - start_time;
VLOG(3) << "render time total for frame: "
Review

I don't think it is a total time. It is the path tracing time for the current work. There might be more path tracing to be done, and there are other types of work as well.

Perhaps it would be much better to log the "bare" time from the RenderScheduler::report_*_time() functions.

<< " " << work_time;
float occupancy_accum = 0.0f;
for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
occupancy_accum += balance_info.occupancy;
@@ -418,8 +473,7 @@ void PathTrace::path_trace(RenderWork &render_work)
const float occupancy = occupancy_accum / num_works;
render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
render_scheduler_.report_path_trace_time(
render_work, time_dt() - start_time, is_cancel_requested());
render_scheduler_.report_path_trace_time(render_work, work_time, is_cancel_requested());
}
void PathTrace::adaptive_sample(RenderWork &render_work)
@@ -865,7 +919,7 @@ void PathTrace::tile_buffer_write_to_disk()
}
/* Get access to the CPU-side render buffers of the current big tile. */
RenderBuffers *buffers;
RenderBuffers *buffers = NULL;
RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
if (path_trace_works_.size() == 1) {

View File

@@ -182,6 +182,12 @@ class PathTrace {
* that the buffer is "uniformly" sampled at the moment of this callback). */
function<void(void)> progress_update_cb;
void set_interleaved_slices(bool setting)
{
interleaved_slices = setting;
VLOG_INFO << "Use interleaved slices:" << interleaved_slices;
}
protected:
/* Actual implementation of the rendering pipeline.
* Calls steps in order, checking for the cancel to be requested in between.
@@ -350,6 +356,8 @@ class PathTrace {
struct {
RenderBuffers *render_buffers = nullptr;
} full_frame_state_;
bool interleaved_slices = true;
};
CCL_NAMESPACE_END

View File

@@ -91,8 +91,13 @@ int2 PathTraceDisplay::get_texture_size() const
* Texture update from CPU buffer.
*/
void PathTraceDisplay::copy_pixels_to_texture(
const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
void PathTraceDisplay::copy_pixels_to_texture(const half4 *rgba_pixels,
int texture_x,
int texture_y,
int pixels_width,
int pixels_height,
int slice_height,
int slice_stride)
{
DCHECK(update_state_.is_active);
@@ -116,20 +121,17 @@ void PathTraceDisplay::copy_pixels_to_texture(
const int texture_width = texture_state_.size.x;
const int texture_height = texture_state_.size.y;
if (texture_x == 0 && texture_y == 0 && pixels_width == texture_width &&
pixels_height == texture_height)
{
const size_t size_in_bytes = sizeof(half4) * texture_width * texture_height;
memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
}
else {
const half4 *rgba_row = rgba_pixels;
half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_width + texture_x;
for (int y = 0; y < pixels_height;
++y, rgba_row += pixels_width, mapped_rgba_row += texture_width) {
memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
const half4 *rgba_row = rgba_pixels;
half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_width + texture_x;
/* loop over each slice */
for (int y = 0; y < pixels_height; y += slice_height) {
int height = std::min(slice_height, pixels_height - y);
half4 *dest = mapped_rgba_row;
for (int rows = 0; rows < height; ++rows, rgba_row += pixels_width, dest += texture_width) {
memcpy(dest, rgba_row, sizeof(half4) * pixels_width);
}
mapped_rgba_row += texture_width * slice_stride;
}
unmap_texture_buffer();

View File

@@ -71,8 +71,13 @@ class PathTraceDisplay {
* for partial updates from different devices. In this case the caller will acquire the lock
* once, update all the slices and release
* the lock once. This will ensure that draw() will never use partially updated texture. */
void copy_pixels_to_texture(
const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height);
void copy_pixels_to_texture(const half4 *rgba_pixels,
int texture_x,
int texture_y,
int pixels_width,
int pixels_height,
int slice_height,
int slice_stride);
/* --------------------------------------------------------------------
* Texture buffer mapping.

View File

@@ -74,64 +74,84 @@ bool PathTraceWork::has_multiple_works() const
void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
{
copy_render_buffers_from_device();
const int y_stride = effective_buffer_params_.slice_stride;
const int slice_height = effective_buffer_params_.slice_height;
const int total_height = effective_buffer_params_.height;
const int64_t width = effective_buffer_params_.width;
const int64_t height = effective_buffer_params_.height;
const int64_t pass_stride = effective_buffer_params_.pass_stride;
const int64_t row_stride = width * pass_stride;
const int64_t data_size = row_stride * height * sizeof(float);
const int64_t data_size = row_stride * sizeof(float);
const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
const int64_t offset_in_floats = offset_y * row_stride;
int y_render = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
for (int y_slice = 0; y_slice < total_height; y_slice += slice_height) {
int height = std::min(total_height - y_slice, slice_height);
const float *src = buffers_->buffer.data();
float *dst = render_buffers->buffer.data() + offset_in_floats;
const float *src = buffers_->buffer.data() + y_slice * row_stride;
float *dst = render_buffers->buffer.data() + y_render * row_stride;
memcpy(dst, src, data_size);
memcpy(dst, src, data_size * height);
y_render += y_stride;
}
}
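An illustrative trace of the loop above with hypothetical numbers, slice_height = 2 and slice_stride = 4, starting at big-tile row y_render = 8 (y_render advances by slice_stride while y_slice advances by slice_height):

/* y_slice = 0: local rows 0-1 -> big-tile rows  8-9   (y_render = 8)  */
/* y_slice = 2: local rows 2-3 -> big-tile rows 12-13  (y_render = 12) */
/* y_slice = 4: local rows 4-5 -> big-tile rows 16-17  (y_render = 16) */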
void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
{
const int y_stride = effective_buffer_params_.slice_stride;
const int slice_height = effective_buffer_params_.slice_height;
const int total_height = effective_buffer_params_.height;
const int64_t width = effective_buffer_params_.width;
const int64_t height = effective_buffer_params_.height;
const int64_t pass_stride = effective_buffer_params_.pass_stride;
const int64_t row_stride = width * pass_stride;
const int64_t data_size = row_stride * height * sizeof(float);
const int64_t data_size = row_stride * sizeof(float);
const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
const int64_t offset_in_floats = offset_y * row_stride;
const float *src = render_buffers->buffer.data() + offset_in_floats;
float *dst = buffers_->buffer.data();
memcpy(dst, src, data_size);
int y_render = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
for (int y_slice = 0; y_slice < total_height; y_slice += slice_height) {
int height = std::min(total_height - y_slice, slice_height);
const float *src = render_buffers->buffer.data() + y_render * row_stride;
float *dst = buffers_->buffer.data() + y_slice * row_stride;
memcpy(dst, src, data_size * height);
y_render += y_stride;
}
copy_render_buffers_to_device();
}
void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
{
const int64_t width = effective_buffer_params_.width;
const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
const int64_t offset = offset_y * width;
const int y_stride = effective_buffer_params_.slice_stride;
const int slice_height = effective_buffer_params_.slice_height;
const int total_height = effective_buffer_params_.height;
render_buffers_host_copy_denoised(
buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
int64_t y_render = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
for (int y_slice = 0; y_slice < total_height; y_slice += slice_height) {
const int64_t dst_offset = y_render * width;
const int64_t src_offset = y_slice * width;
int height = std::min(total_height - y_slice, slice_height);
render_buffers_host_copy_denoised(buffers_.get(),
effective_buffer_params_,
src_offset,
height,
render_buffers,
effective_buffer_params_,
dst_offset);
y_render += y_stride;
}
copy_render_buffers_to_device();
}
bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
const PassAccessor::Destination &destination)
{
const int offset_y = (effective_buffer_params_.full_y + effective_buffer_params_.window_y) -
(effective_big_tile_params_.full_y + effective_big_tile_params_.window_y);
const int width = effective_buffer_params_.width;
const int offset_y = (effective_buffer_params_.full_y + effective_buffer_params_.window_y) -
(effective_big_tile_params_.full_y + effective_big_tile_params_.window_y);
const int width = effective_buffer_params_.width;
PassAccessor::Destination slice_destination = destination;
slice_destination.offset += offset_y * width;
PassAccessor::Destination slice_destination = destination;
slice_destination.offset += offset_y * width;
return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
}
bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,

View File

@@ -93,6 +93,9 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
work_tile.num_samples = 1;
work_tile.offset = effective_buffer_params_.offset;
work_tile.stride = effective_buffer_params_.stride;
work_tile.slice_start_y = effective_buffer_params_.full_y;
work_tile.slice_height = effective_buffer_params_.slice_height;
work_tile.slice_stride = effective_buffer_params_.slice_stride;
CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

View File

@@ -307,12 +307,15 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
* schedules work in halves of available number of paths. */
work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
work_tile_scheduler_.set_accelerated_rt(
(device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
(device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) != 0);
work_tile_scheduler_.reset(effective_buffer_params_,
start_sample,
samples_num,
sample_offset,
device_scene_->data.integrator.scrambling_distance);
device_scene_->data.integrator.scrambling_distance,
effective_buffer_params_.full_y,
effective_buffer_params_.slice_height,
effective_buffer_params_.slice_stride);
enqueue_reset();
@@ -979,7 +982,13 @@ void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
queue_->copy_from_device(display_rgba_half_);
queue_->synchronize();
display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
display->copy_pixels_to_texture(display_rgba_half_.data(),
texture_x,
texture_y,
width,
height,
effective_buffer_params_.slice_height,
effective_buffer_params_.slice_stride);
}
bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,

View File

@@ -28,7 +28,10 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
int sample_start,
int samples_num,
int sample_offset,
float scrambling_distance)
float scrambling_distance,
int slice_start_y,
int slice_height,
int slice_stride)
{
/* Image buffer parameters. */
image_full_offset_px_.x = buffer_params.full_x;
@@ -45,6 +48,11 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
samples_num_ = samples_num;
sample_offset_ = sample_offset;
/* Slice parameters */
slice_start_y_ = slice_start_y;
slice_height_ = slice_height;
slice_stride_ = slice_stride;
/* Initialize new scheduling. */
reset_scheduler_state();
}
@@ -116,6 +124,11 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_
work_tile.offset = offset_;
work_tile.stride = stride_;
/* Add slice stride and height */
work_tile.slice_start_y = slice_start_y_;
work_tile.slice_height = slice_height_;
work_tile.slice_stride = slice_stride_;
work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);

View File

@@ -33,7 +33,10 @@ class WorkTileScheduler {
int sample_start,
int samples_num,
int sample_offset,
float scrambling_distance);
float scrambling_distance,
int slice_start_y,
int slice_height,
int slice_stride);
/* Get work for a device.
* Returns true if there is still work to be done and initialize the work tile to all
@@ -76,6 +79,11 @@ class WorkTileScheduler {
int samples_num_ = 0;
int sample_offset_ = 0;
/* Slice parameters */
int slice_height_; /* number of scanlines in a slice */
int slice_stride_; /* stride between slices */
int slice_start_y_; /* starting y of slice */
/* Tile size which be scheduled for rendering. */
TileSize tile_size_;

View File

@@ -77,13 +77,19 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
const int sample = film_write_sample(
kg, state, render_buffer, scheduled_sample, tile->sample_offset);
/* Map the buffer coordinates to the image coordinates */
int tile_y = y - tile->slice_start_y;
Review

Should the same mapping be done for the init_from_bake?

int slice_count = tile_y / tile->slice_height;
tile_y = tile_y - slice_count * tile->slice_height;
tile_y = tile->slice_stride * slice_count + tile_y + tile->slice_start_y;
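A worked example of the mapping above with hypothetical numbers, slice_start_y = 100, slice_height = 2, slice_stride = 8, and a buffer row y = 105:

/* tile_y      = 105 - 100       = 5   (row inside this device's compact buffer)   */
/* slice_count = 5 / 2           = 2   (the row lives in the device's third slice)  */
/* tile_y      = 5 - 2 * 2       = 1   (row inside that slice)                      */
/* tile_y      = 8 * 2 + 1 + 100 = 117 (the row's true y in the full image)         */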
/* Initialize random number seed for path. */
const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
const uint rng_hash = path_rng_hash_init(kg, sample, x, tile_y);
{
/* Generate camera ray. */
Ray ray;
integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
integrate_camera_sample(kg, sample, x, tile_y, rng_hash, &ray);
if (ray.tmax == 0.0f) {
return true;
}

View File

@@ -1558,6 +1558,12 @@ typedef struct KernelWorkTile {
/* Precalculated parameters used by init_from_camera kernel on GPU. */
int path_index_offset;
int work_size;
/* slice details */
uint slice_start_y;
uint slice_height;
uint slice_stride;
} KernelWorkTile;
/* Shader Evaluation.

View File

@@ -95,6 +95,9 @@ NODE_DEFINE(BufferParams)
SOCKET_INT(full_width, "Full Width", 0);
SOCKET_INT(full_height, "Full Height", 0);
SOCKET_INT(slice_stride, "Slice Stride", 0);
SOCKET_INT(slice_height, "Slice height", 0);
SOCKET_STRING(layer, "Layer", ustring());
SOCKET_STRING(view, "View", ustring());
SOCKET_INT(samples, "Samples", 0);
@@ -305,6 +308,8 @@ void RenderBuffers::copy_to_device()
void render_buffers_host_copy_denoised(RenderBuffers *dst,
const BufferParams &dst_params,
const size_t dst_offset,
const size_t dst_height,
const RenderBuffers *src,
const BufferParams &src_params,
const size_t src_offset)
@@ -345,15 +350,15 @@ void render_buffers_host_copy_denoised(RenderBuffers *dst,
/* TODO(sergey): Make it more reusable, allowing implement copy of noisy passes. */
const int64_t dst_width = dst_params.width;
const int64_t dst_height = dst_params.height;
const int64_t dst_pass_stride = dst_params.pass_stride;
const int64_t dst_num_pixels = dst_width * dst_height;
const int64_t dst_offset_in_floats = dst_offset * dst_pass_stride;
const int64_t src_pass_stride = src_params.pass_stride;
const int64_t src_offset_in_floats = src_offset * src_pass_stride;
const float *src_pixel = src->buffer.data() + src_offset_in_floats;
float *dst_pixel = dst->buffer.data();
float *dst_pixel = dst->buffer.data() + dst_offset_in_floats;
for (int i = 0; i < dst_num_pixels;
++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride)

View File

@@ -93,6 +93,10 @@ class BufferParams : public Node {
/* Runtime fields, only valid after `update_passes()`. */
int pass_stride = -1;
/* Slice details */
int slice_stride;
int slice_height;
/* Properties which are used for accessing buffer pixels outside of scene graph. */
vector<BufferPass> passes;
ustring layer;
@@ -180,6 +184,8 @@ class RenderBuffers {
* Copy happens of the number of pixels in the destination. */
void render_buffers_host_copy_denoised(RenderBuffers *dst,
const BufferParams &dst_params,
const size_t dst_offset,
const size_t dst_height,
const RenderBuffers *src,
const BufferParams &src_params,
const size_t src_offset = 0);

View File

@@ -55,6 +55,7 @@ Session::Session(const SessionParams &params_, const SceneParams &scene_params)
device, scene->film, &scene->dscene, render_scheduler_, tile_manager_);
path_trace_->set_progress(&progress);
path_trace_->progress_update_cb = [&]() { update_status_time(); };
path_trace_->set_interleaved_slices(params.interleaved_slices);
tile_manager_.full_buffer_written_cb = [&](string_view filename) {
if (!full_buffer_written_cb) {

View File

@@ -58,6 +58,7 @@ class SessionParams {
bool use_resolution_divider;
ShadingSystem shadingsystem;
int interleaved_slices = true;
/* Session-specific temporary directory to store in-progress EXR files in. */
string temp_dir;
@@ -82,6 +83,7 @@ class SessionParams {
use_resolution_divider = true;
shadingsystem = SHADINGSYSTEM_SVM;
interleaved_slices = false;
}
bool modified(const SessionParams &params) const