WIP: Interleaved slices for better work distribution with a multi-GPU setup #110348

Draft
William Leeson wants to merge 82 commits from leesonw/blender-cluster:work_sets_similar into main

22 changed files with 350 additions and 99 deletions

View File

@@ -978,6 +978,12 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
min=8, max=8192,
)
interleaved_slices: BoolProperty(
Review

Outside of development purposes I don't think this should be an option.

If we plan to keep it for future development/investigation, it is better to move it to the debug panel.
Otherwise perhaps just remove the option.

name="Interleaved Slices",
default=True,
description="If true, work is distributed as many small interleaved work slices so as to spread the work more evenly; otherwise a single big slice per device is used"
)
# Various fine-tuning debug flags
def _devices_update_callback(self, context):
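For context on what this toggle changes on the Cycles side (an illustrative sketch only, not code from this patch): with interleaved slices every device owns a small band of rows that repeats every slice_stride rows, instead of one large contiguous block. A hypothetical helper mapping an image row to its device could look like this:

/* Illustrative only: which device renders image row y when slices are interleaved.
 * Assumes slice_sizes[] and slice_stride were derived from the per-device weights. */
int device_for_row(const int y, const int *slice_sizes, const int num_devices, const int slice_stride)
{
  int offset = y % slice_stride; /* Position inside the repeating group of slices. */
  for (int i = 0; i < num_devices; i++) {
    if (offset < slice_sizes[i]) {
      return i;
    }
    offset -= slice_sizes[i];
  }
  return num_devices - 1; /* Unreachable if the sizes sum to slice_stride. */
}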

View File

@@ -866,6 +866,22 @@ class CYCLES_RENDER_PT_performance_viewport(CyclesButtonsPanel, Panel):
col.prop(rd, "preview_pixel_size", text="Pixel Size")
class CYCLES_RENDER_PT_performance_multiple_device(CyclesButtonsPanel, Panel):
bl_label = "Multiple Device"
bl_parent_id = "CYCLES_RENDER_PT_performance"
def draw(self, context):
layout = self.layout
layout.use_property_split = True
layout.use_property_decorate = False
scene = context.scene
cscene = scene.cycles
col = layout.column()
col.prop(cscene, "interleaved_slices")
class CYCLES_RENDER_PT_filter(CyclesButtonsPanel, Panel):
bl_label = "Filter"
bl_options = {'DEFAULT_CLOSED'}
@@ -2526,6 +2542,7 @@ classes = (
CYCLES_RENDER_PT_performance_acceleration_structure,
CYCLES_RENDER_PT_performance_final_render,
CYCLES_RENDER_PT_performance_viewport,
CYCLES_RENDER_PT_performance_multiple_device,
CYCLES_RENDER_PT_passes,
CYCLES_RENDER_PT_passes_data,
CYCLES_RENDER_PT_passes_light,

View File

@@ -851,6 +851,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
SessionParams params;
PointerRNA cscene = RNA_pointer_get(&b_scene.ptr, "cycles");
params.interleaved_slices = get_boolean(cscene, "interleaved_slices");
if (background && !b_engine.is_preview()) {
/* Viewport and preview renders do not require temp directory and do request session
* parameters more often than the background render.

View File

@@ -13,6 +13,7 @@
# include "device/metal/bvh.h"
# include "device/metal/util.h"
# include "device/device.h"
CCL_NAMESPACE_BEGIN

View File

@@ -81,8 +81,12 @@ bool DenoiserGPU::denoise_buffer(const BufferParams &buffer_params,
if (local_buffer_used) {
local_render_buffers.copy_from_device();
render_buffers_host_copy_denoised(
render_buffers, buffer_params, &local_render_buffers, local_render_buffers.params);
render_buffers_host_copy_denoised(render_buffers,
buffer_params,
0,
buffer_params.height,
&local_render_buffers,
local_render_buffers.params);
render_buffers->copy_to_device();
}

View File

@@ -88,21 +88,30 @@ static void pad_pixels(const BufferParams &buffer_params,
return;
}
const size_t size = static_cast<size_t>(buffer_params.width) * buffer_params.height;
const int width = buffer_params.width;
const int slice_stride = buffer_params.slice_stride;
const int slice_height = buffer_params.slice_height;
const int total_height = buffer_params.window_height;
if (destination.pixels) {
const size_t pixel_stride = destination.pixel_stride ? destination.pixel_stride :
destination.num_components;
float *pixel = destination.pixels + pixel_stride * destination.offset;
for (size_t i = 0; i < size; i++, pixel += dest_num_components) {
if (dest_num_components >= 3 && src_num_components == 1) {
pixel[1] = pixel[0];
pixel[2] = pixel[0];
}
if (dest_num_components >= 4) {
pixel[3] = 1.0f;
for (int slice_y = 0; slice_y < total_height; slice_y += slice_height) {
float *dst = pixel;
const int height = std::min(slice_height, total_height - slice_y);
const int size = width * height;
for (size_t i = 0; i < size; i++, dst += dest_num_components) {
if (dest_num_components >= 3 && src_num_components == 1) {
dst[1] = dst[0];
dst[2] = dst[0];
}
if (dest_num_components >= 4) {
dst[3] = 1.0f;
}
}
pixel += slice_stride * width * dest_num_components;
}
}
@@ -110,14 +119,20 @@ static void pad_pixels(const BufferParams &buffer_params,
const half one = float_to_half_display(1.0f);
half4 *pixel = destination.pixels_half_rgba + destination.offset;
for (size_t i = 0; i < size; i++, pixel++) {
if (dest_num_components >= 3 && src_num_components == 1) {
pixel[0].y = pixel[0].x;
pixel[0].z = pixel[0].x;
}
if (dest_num_components >= 4) {
pixel[0].w = one;
for (int slice_y = 0; slice_y < total_height; slice_y += slice_height) {
half4 *dst = pixel;
const int height = std::min(slice_height, total_height - slice_y);
const int size = width * height;
for (size_t i = 0; i < size; i++, dst++) {
if (dest_num_components >= 3 && src_num_components == 1) {
dst[0].y = dst[0].x;
dst[0].z = dst[0].x;
}
if (dest_num_components >= 4) {
dst[0].w = one;
}
}
pixel += slice_stride * width;
}
}
}
@@ -298,7 +313,6 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
const BufferParams &buffer_params = render_buffers->params;
float *buffer_data = render_buffers->buffer.data();
const int size = buffer_params.width * buffer_params.height;
const int out_stride = buffer_params.pass_stride;
const int in_stride = source.num_components;
@@ -306,11 +320,18 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
float *out = buffer_data + pass_access_info_.offset;
const float *in = source.pixels + source.offset * in_stride;
for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
memcpy(out, in, sizeof(float) * num_components_to_copy);
const int slice_height = buffer_params.slice_height;
const int total_height = buffer_params.height;
const int slice_stride = buffer_params.slice_stride;
for (int y = 0; y < total_height; y += slice_height) {
const int height = std::min(slice_height, total_height - y);
const float *src = in;
const int size = height * buffer_params.width;
for (int i = 0; i < size; i++, out += out_stride, src += in_stride) {
memcpy(out, src, sizeof(float) * num_components_to_copy);
}
in += slice_stride * buffer_params.width * in_stride;
}
return true;
}

View File

@@ -45,11 +45,38 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
destination.num_components;
parallel_for(0, buffer_params.window_height, [&](int64_t y) {
/* Calculate how many full plus partial slices there are */
Review

Full stops in the comments.

int slice_height;
int slice_stride;
int slices;
if(buffer_params.slice_height > 0) {
/* Copy over each slice */
slices = buffer_params.window_height/buffer_params.slice_height;
slices += (slices*buffer_params.slice_height < buffer_params.window_height) ? 1 : 0;
slice_height = buffer_params.slice_height;
slice_stride = buffer_params.slice_stride;
}
else {
/* Assign each row to a slice */
slices = buffer_params.window_height;
slice_height = 1;
slice_stride = 1;
}
/* Copy over each slice to the destination */
parallel_for(0, slices, [&](int slice) {
//for(int slice = 0;slice < slices;++slice) {
Review

Remove the dead code. Also, it seems that clang-format is not properly configured in your setup.

int y = slice*slice_height;
const float *buffer = window_data + y * buffer_row_stride;
float *pixel = destination.pixels +
(y * buffer_params.width + destination.offset) * pixel_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride, pixel_stride);
int height = std::min(slice_height, buffer_params.window_height - y);
int pixel_y = slice * slice_stride;
float *pixels = destination.pixels +
(pixel_y * buffer_params.width + destination.offset) * pixel_stride;
for (int row = 0; row < height;
row++, buffer += buffer_row_stride, pixels += buffer_params.width * pixel_stride)
{
func(kfilm_convert, buffer, pixels, buffer_params.window_width, pass_stride, pixel_stride);
}
});
}
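The slice count computed above is a ceiling division; an equivalent single expression (illustrative only, not part of the patch) is:

/* Number of full plus partial slices, e.g. window_height = 10, slice_height = 4 gives 3. */
const int slices = (buffer_params.window_height + buffer_params.slice_height - 1) /
                   buffer_params.slice_height;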
@@ -70,10 +97,35 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
const int destination_stride = destination.stride != 0 ? destination.stride :
buffer_params.width;
parallel_for(0, buffer_params.window_height, [&](int64_t y) {
/* Calculate how many full plus partial slices there are */
int slice_height;
int slice_stride;
int slices;
if(buffer_params.slice_height > 0) {
/* Copy over each slice */
slices = buffer_params.window_height/buffer_params.slice_height;
slices += (slices*buffer_params.slice_height < buffer_params.window_height) ? 1 : 0;
slice_height = buffer_params.slice_height;
slice_stride = buffer_params.slice_stride;
}
else {
/* Assign each row to a slice */
slices = buffer_params.window_height;
slice_height = 1;
slice_stride = 1;
}
parallel_for(0, slices, [&](int slice) {
//for (int slice = 0; slice < slices; ++slice) {
int y = slice*slice_height;
const float *buffer = window_data + y * buffer_row_stride;
half4 *pixel = dst_start + y * destination_stride;
func(kfilm_convert, buffer, pixel, buffer_params.window_width, pass_stride);
int height = std::min(slice_height, buffer_params.window_height - y);
int pixel_y = slice * buffer_params.slice_stride;
half4 *pixels = dst_start + pixel_y * destination_stride;
for (int row = 0; row < height;
row++, buffer += buffer_row_stride, pixels += destination_stride) {
func(kfilm_convert, buffer, pixels, buffer_params.window_width, pass_stride);
}
});
}

View File

@@ -50,7 +50,6 @@ PathTrace::PathTrace(Device *device,
path_trace_works_.emplace_back(std::move(work));
}
});
work_balance_infos_.resize(path_trace_works_.size());
work_balance_do_initial(work_balance_infos_);
@@ -246,42 +245,92 @@ static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>>
const vector<WorkBalanceInfo> &work_balance_infos,
const BufferParams &buffer_params,
const int overscan,
const bool interleaved_slices,
const Callback &callback)
{
const int num_works = path_trace_works.size();
const int window_height = buffer_params.window_height;
/* Find the largest and smallest weights */
int largest_weight = 0;
int smallest_weight = 0;
for (int i = 0; i < num_works; i++) {
double weight = work_balance_infos[i].weight;
if (weight > work_balance_infos[largest_weight].weight) {
largest_weight = i;
}
if (weight < work_balance_infos[smallest_weight].weight) {
smallest_weight = i;
}
}
int biggest_slice = work_balance_infos[largest_weight].weight /
work_balance_infos[smallest_weight].weight;
int slice_stride = 0;
int slice_sizes[num_works];
int remaining_rows = window_height;
double allocatable_slices;
double fixed_slices;
bool use_interleaved = (biggest_slice < (window_height / num_works)) && interleaved_slices;
if (use_interleaved) {
/* Assign the slices entirely based on the weight */
allocatable_slices = 1.0 / work_balance_infos[smallest_weight].weight;
fixed_slices = 0.5; /* Round up so that smallest always gets 1 */
}
else {
/* Instead of using interleaved slices create n bigger consecutive slices */
/* Aim to achieve at least 1 slice per device otherwise use consecutive slices */
allocatable_slices = remaining_rows - num_works; /* each slice must have at least 1 row */
fixed_slices = 1; /* Make sure all slices get at least 1 */
}
/* Assign a size to each slice based on its weight */
for (int i = 0; i < num_works; i++) {
int slice_size = std::floor(work_balance_infos[i].weight * allocatable_slices + fixed_slices);
slice_sizes[i] = slice_size;
slice_stride += slice_size;
}
/* If there are any remaining scanlines due to truncation add them to the device with the
* highest weight */
int leftover_scanlines = allocatable_slices - slice_stride;
if (leftover_scanlines > 0) {
slice_sizes[largest_weight] += leftover_scanlines;
slice_stride++;
} else if(leftover_scanlines < 0) {
VLOG_WARNING << "#######Used to many scanlines";
Review

Not sure about the details, but it seems that VLOG_WARNING << "Used to many scanlines" would be a much better fit here.

If this is something that must get developer attention, DCHECK_GE(leftover_scanlines, 0) can be used above.

}
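A minimal sketch of what the review above suggests (using the DCHECK_GE and VLOG_WARNING macros it mentions; illustrative only, not part of the patch):

/* Developer-facing assert plus a terse log, as suggested in the review. */
DCHECK_GE(leftover_scanlines, 0);
if (leftover_scanlines < 0) {
  VLOG_WARNING << "Used too many scanlines";
}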
VLOG_INFO << "===================SLICE allocatable:" << allocatable_slices << " fixed:"<< fixed_slices << "================";
Review

Not sure I'd keep such explicit separation. Surely, it is important for this PR, but outside of this PR it feels it'll draw too much attention.

int slices = window_height / slice_stride;
int current_y = 0;
for (int i = 0; i < num_works; ++i) {
const double weight = work_balance_infos[i].weight;
const int slice_window_full_y = buffer_params.full_y + buffer_params.window_y + current_y;
const int slice_window_height = max(lround(window_height * weight), 1);
/* Disallow negative values to deal with situations when there are more compute devices than
* scan-lines. */
const int remaining_window_height = max(0, window_height - current_y);
const int slice_left_at_end = std::max(0, window_height - slices * slice_stride - current_y);
const int slice_window_height =
slices * slice_sizes[i] +
std::min(slice_sizes[i], slice_left_at_end);
BufferParams slice_params = buffer_params;
slice_params.full_y = max(slice_window_full_y - overscan, buffer_params.full_y);
slice_params.window_y = slice_window_full_y - slice_params.full_y;
if (i < num_works - 1) {
slice_params.window_height = min(slice_window_height, remaining_window_height);
}
else {
slice_params.window_height = remaining_window_height;
}
slice_params.window_height = slice_window_height;
slice_params.height = slice_params.window_y + slice_params.window_height + overscan;
slice_params.height = min(slice_params.height,
buffer_params.height + buffer_params.full_y - slice_params.full_y);
slice_params.slice_height = slice_sizes[i];
slice_params.slice_stride = slice_stride;
slice_params.update_offset_stride();
callback(path_trace_works[i].get(), slice_params);
current_y += slice_params.window_height;
current_y += slice_sizes[i];
VLOG_INFO << "(" << i << ") Slice size:" << slice_sizes[i] << " weight:" << work_balance_infos[i].weight;
}
}
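To make the interleaved branch of the allocation above concrete, here is a self-contained illustrative sketch with made-up weights (not code from this patch):

#include <cmath>

/* Illustrative only: the interleaved branch with two devices of weights 0.75 and 0.25. */
void example_slice_allocation()
{
  const double weights[2] = {0.75, 0.25};
  const double allocatable_slices = 1.0 / 0.25; /* 1 / smallest weight = 4. */
  const double fixed_slices = 0.5;              /* Round so the smallest device still gets 1 row. */
  int slice_sizes[2];
  int slice_stride = 0;
  for (int i = 0; i < 2; i++) {
    slice_sizes[i] = static_cast<int>(std::floor(weights[i] * allocatable_slices + fixed_slices));
    slice_stride += slice_sizes[i];
  }
  /* slice_sizes = {3, 1}, slice_stride = 4: each group of 4 consecutive rows gives 3 rows to
   * the fast device and 1 row to the slow one, repeated down the image. */
  (void)slice_stride;
}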
@@ -292,6 +341,7 @@ void PathTrace::update_allocated_work_buffer_params()
work_balance_infos_,
big_tile_params_,
overscan,
interleaved_slices,
[](PathTraceWork *path_trace_work, const BufferParams &params) {
RenderBuffers *buffers = path_trace_work->get_render_buffers();
buffers->reset(params);
@@ -334,6 +384,7 @@ void PathTrace::update_effective_work_buffer_params(const RenderWork &render_wor
work_balance_infos_,
scaled_big_tile_params,
overscan,
interleaved_slices,
[&](PathTraceWork *path_trace_work, const BufferParams params) {
path_trace_work->set_effective_buffer_params(
scaled_full_params, scaled_big_tile_params, params);
@@ -411,6 +462,10 @@ void PathTrace::path_trace(RenderWork &render_work)
<< " seconds per sample), occupancy: " << statistics.occupancy;
});
const double work_time = time_dt() - start_time;
VLOG(3) << "render time total for frame: "
Review

I don't think it is a total time. It is the path tracing time for the current work. There might be more path tracing to be done, and there are other types of work as well.

Perhaps it would be much better to log the "bare" time from the RenderScheduler::report_*_time() functions.

<< " " << work_time;
float occupancy_accum = 0.0f;
for (const WorkBalanceInfo &balance_info : work_balance_infos_) {
occupancy_accum += balance_info.occupancy;
@@ -418,8 +473,7 @@ void PathTrace::path_trace(RenderWork &render_work)
const float occupancy = occupancy_accum / num_works;
render_scheduler_.report_path_trace_occupancy(render_work, occupancy);
render_scheduler_.report_path_trace_time(
render_work, time_dt() - start_time, is_cancel_requested());
render_scheduler_.report_path_trace_time(render_work, work_time, is_cancel_requested());
}
void PathTrace::adaptive_sample(RenderWork &render_work)
@@ -865,7 +919,7 @@ void PathTrace::tile_buffer_write_to_disk()
}
/* Get access to the CPU-side render buffers of the current big tile. */
RenderBuffers *buffers;
RenderBuffers *buffers = NULL;
RenderBuffers big_tile_cpu_buffers(cpu_device_.get());
if (path_trace_works_.size() == 1) {

View File

@@ -182,6 +182,12 @@ class PathTrace {
* that the buffer is "uniformly" sampled at the moment of this callback). */
function<void(void)> progress_update_cb;
void set_interleaved_slices(bool setting)
{
interleaved_slices = setting;
VLOG_INFO << "Use interleaved slices:" << interleaved_slices;
}
protected:
/* Actual implementation of the rendering pipeline.
* Calls steps in order, checking for the cancel to be requested in between.
@@ -350,6 +356,8 @@ class PathTrace {
struct {
RenderBuffers *render_buffers = nullptr;
} full_frame_state_;
bool interleaved_slices = true;
};
CCL_NAMESPACE_END

View File

@@ -91,8 +91,13 @@ int2 PathTraceDisplay::get_texture_size() const
* Texture update from CPU buffer.
*/
void PathTraceDisplay::copy_pixels_to_texture(
const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height)
void PathTraceDisplay::copy_pixels_to_texture(const half4 *rgba_pixels,
int texture_x,
int texture_y,
int pixels_width,
int pixels_height,
int slice_height,
int slice_stride)
{
DCHECK(update_state_.is_active);
@@ -116,20 +121,17 @@ void PathTraceDisplay::copy_pixels_to_texture(
const int texture_width = texture_state_.size.x;
const int texture_height = texture_state_.size.y;
if (texture_x == 0 && texture_y == 0 && pixels_width == texture_width &&
pixels_height == texture_height)
{
const size_t size_in_bytes = sizeof(half4) * texture_width * texture_height;
memcpy(mapped_rgba_pixels, rgba_pixels, size_in_bytes);
}
else {
const half4 *rgba_row = rgba_pixels;
half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_width + texture_x;
for (int y = 0; y < pixels_height;
++y, rgba_row += pixels_width, mapped_rgba_row += texture_width) {
memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
const half4 *rgba_row = rgba_pixels;
half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_width + texture_x;
/* loop over each slice */
for (int y = 0; y < pixels_height; y += slice_height) {
int height = std::min(slice_height, pixels_height - y);
half4 *dest = mapped_rgba_row;
for (int rows = 0; rows < height; ++rows, rgba_row += pixels_width, dest += texture_width) {
memcpy(dest, rgba_row, sizeof(half4) * pixels_width);
}
mapped_rgba_row += texture_width * slice_stride;
}
unmap_texture_buffer();

View File

@@ -71,8 +71,13 @@ class PathTraceDisplay {
* for partial updates from different devices. In this case the caller will acquire the lock
* once, update all the slices and release
* the lock once. This will ensure that draw() will never use partially updated texture. */
void copy_pixels_to_texture(
const half4 *rgba_pixels, int texture_x, int texture_y, int pixels_width, int pixels_height);
void copy_pixels_to_texture(const half4 *rgba_pixels,
int texture_x,
int texture_y,
int pixels_width,
int pixels_height,
int slice_height,
int slice_stride);
/* --------------------------------------------------------------------
* Texture buffer mapping.

View File

@@ -74,64 +74,84 @@ bool PathTraceWork::has_multiple_works() const
void PathTraceWork::copy_to_render_buffers(RenderBuffers *render_buffers)
{
copy_render_buffers_from_device();
const int y_stride = effective_buffer_params_.slice_stride;
const int slice_height = effective_buffer_params_.slice_height;
const int total_height = effective_buffer_params_.height;
const int64_t width = effective_buffer_params_.width;
const int64_t height = effective_buffer_params_.height;
const int64_t pass_stride = effective_buffer_params_.pass_stride;
const int64_t row_stride = width * pass_stride;
const int64_t data_size = row_stride * height * sizeof(float);
const int64_t data_size = row_stride * sizeof(float);
const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
const int64_t offset_in_floats = offset_y * row_stride;
int y_render = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
for (int y_slice = 0; y_slice < total_height; y_slice += slice_height) {
int height = std::min(total_height - y_slice, slice_height);
const float *src = buffers_->buffer.data();
float *dst = render_buffers->buffer.data() + offset_in_floats;
const float *src = buffers_->buffer.data() + y_slice * row_stride;
float *dst = render_buffers->buffer.data() + y_render * row_stride;
memcpy(dst, src, data_size);
memcpy(dst, src, data_size * height);
y_render += y_stride;
}
}
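An illustrative trace of the loop above with hypothetical numbers, slice_height = 2 and slice_stride = 4, starting at big-tile row y_render = 8 (y_render advances by slice_stride while y_slice advances by slice_height):

/* y_slice = 0: local rows 0-1 -> big-tile rows  8-9   (y_render = 8)  */
/* y_slice = 2: local rows 2-3 -> big-tile rows 12-13  (y_render = 12) */
/* y_slice = 4: local rows 4-5 -> big-tile rows 16-17  (y_render = 16) */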
void PathTraceWork::copy_from_render_buffers(const RenderBuffers *render_buffers)
{
const int y_stride = effective_buffer_params_.slice_stride;
const int slice_height = effective_buffer_params_.slice_height;
const int total_height = effective_buffer_params_.height;
const int64_t width = effective_buffer_params_.width;
const int64_t height = effective_buffer_params_.height;
const int64_t pass_stride = effective_buffer_params_.pass_stride;
const int64_t row_stride = width * pass_stride;
const int64_t data_size = row_stride * height * sizeof(float);
const int64_t data_size = row_stride * sizeof(float);
const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
const int64_t offset_in_floats = offset_y * row_stride;
const float *src = render_buffers->buffer.data() + offset_in_floats;
float *dst = buffers_->buffer.data();
memcpy(dst, src, data_size);
int y_render = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
for (int y_slice = 0; y_slice < total_height; y_slice += slice_height) {
int height = std::min(total_height - y_slice, slice_height);
const float *src = render_buffers->buffer.data() + y_render * row_stride;
float *dst = buffers_->buffer.data() + y_slice * row_stride;
memcpy(dst, src, data_size * height);
y_render += y_stride;
}
copy_render_buffers_to_device();
}
void PathTraceWork::copy_from_denoised_render_buffers(const RenderBuffers *render_buffers)
{
const int64_t width = effective_buffer_params_.width;
const int64_t offset_y = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
const int64_t offset = offset_y * width;
const int y_stride = effective_buffer_params_.slice_stride;
const int slice_height = effective_buffer_params_.slice_height;
const int total_height = effective_buffer_params_.height;
render_buffers_host_copy_denoised(
buffers_.get(), effective_buffer_params_, render_buffers, effective_buffer_params_, offset);
int64_t y_render = effective_buffer_params_.full_y - effective_big_tile_params_.full_y;
for (int y_slice = 0; y_slice < total_height; y_slice += slice_height) {
const int64_t dst_offset = y_render * width;
const int64_t src_offset = y_slice * width;
int height = std::min(total_height - y_slice, slice_height);
render_buffers_host_copy_denoised(buffers_.get(),
effective_buffer_params_,
src_offset,
height,
render_buffers,
effective_buffer_params_,
dst_offset);
y_render += y_stride;
}
copy_render_buffers_to_device();
}
bool PathTraceWork::get_render_tile_pixels(const PassAccessor &pass_accessor,
const PassAccessor::Destination &destination)
{
const int offset_y = (effective_buffer_params_.full_y + effective_buffer_params_.window_y) -
(effective_big_tile_params_.full_y + effective_big_tile_params_.window_y);
const int width = effective_buffer_params_.width;
const int offset_y = (effective_buffer_params_.full_y + effective_buffer_params_.window_y) -
(effective_big_tile_params_.full_y + effective_big_tile_params_.window_y);
const int width = effective_buffer_params_.width;
PassAccessor::Destination slice_destination = destination;
slice_destination.offset += offset_y * width;
PassAccessor::Destination slice_destination = destination;
slice_destination.offset += offset_y * width;
return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
return pass_accessor.get_render_tile_pixels(buffers_.get(), slice_destination);
}
bool PathTraceWork::set_render_tile_pixels(PassAccessor &pass_accessor,

View File

@@ -93,6 +93,9 @@ void PathTraceWorkCPU::render_samples(RenderStatistics &statistics,
work_tile.num_samples = 1;
work_tile.offset = effective_buffer_params_.offset;
work_tile.stride = effective_buffer_params_.stride;
work_tile.slice_start_y = effective_buffer_params_.full_y;
work_tile.slice_height = effective_buffer_params_.slice_height;
work_tile.slice_stride = effective_buffer_params_.slice_stride;
CPUKernelThreadGlobals *kernel_globals = kernel_thread_globals_get(kernel_thread_globals_);

View File

@@ -307,12 +307,15 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
* schedules work in halves of available number of paths. */
work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
work_tile_scheduler_.set_accelerated_rt(
(device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
(device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) != 0);
work_tile_scheduler_.reset(effective_buffer_params_,
start_sample,
samples_num,
sample_offset,
device_scene_->data.integrator.scrambling_distance);
device_scene_->data.integrator.scrambling_distance,
effective_buffer_params_.full_y,
effective_buffer_params_.slice_height,
effective_buffer_params_.slice_stride);
enqueue_reset();
@@ -979,7 +982,13 @@ void PathTraceWorkGPU::copy_to_display_naive(PathTraceDisplay *display,
queue_->copy_from_device(display_rgba_half_);
queue_->synchronize();
display->copy_pixels_to_texture(display_rgba_half_.data(), texture_x, texture_y, width, height);
display->copy_pixels_to_texture(display_rgba_half_.data(),
texture_x,
texture_y,
width,
height,
effective_buffer_params_.slice_height,
effective_buffer_params_.slice_stride);
}
bool PathTraceWorkGPU::copy_to_display_interop(PathTraceDisplay *display,

View File

@@ -28,7 +28,10 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
int sample_start,
int samples_num,
int sample_offset,
float scrambling_distance)
float scrambling_distance,
int slice_start_y,
int slice_height,
int slice_stride)
{
/* Image buffer parameters. */
image_full_offset_px_.x = buffer_params.full_x;
@@ -45,6 +48,11 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
samples_num_ = samples_num;
sample_offset_ = sample_offset;
/* Slice parameters */
slice_start_y_ = slice_start_y;
slice_height_ = slice_height;
slice_stride_ = slice_stride;
/* Initialize new scheduling. */
reset_scheduler_state();
}
@@ -116,6 +124,11 @@ bool WorkTileScheduler::get_work(KernelWorkTile *work_tile_, const int max_work_
work_tile.offset = offset_;
work_tile.stride = stride_;
/* Add slice stride and height */
work_tile.slice_start_y = slice_start_y_;
work_tile.slice_height = slice_height_;
work_tile.slice_stride = slice_stride_;
work_tile.w = min(work_tile.w, image_size_px_.x - work_tile.x);
work_tile.h = min(work_tile.h, image_size_px_.y - work_tile.y);

View File

@@ -33,7 +33,10 @@ class WorkTileScheduler {
int sample_start,
int samples_num,
int sample_offset,
float scrambling_distance);
float scrambling_distance,
int slice_start_y,
int slice_height,
int slice_stride);
/* Get work for a device.
* Returns true if there is still work to be done and initialize the work tile to all
@@ -76,6 +79,11 @@ class WorkTileScheduler {
int samples_num_ = 0;
int sample_offset_ = 0;
/* Slice parameters */
int slice_height_; /* number of scanlines in a slice */
int slice_stride_; /* stride between slices */
int slice_start_y_; /* starting y of slice */
/* Tile size which be scheduled for rendering. */
TileSize tile_size_;

View File

@@ -77,13 +77,19 @@ ccl_device bool integrator_init_from_camera(KernelGlobals kg,
const int sample = film_write_sample(
kg, state, render_buffer, scheduled_sample, tile->sample_offset);
/* Map the buffer coordinates to the image coordinates */
int tile_y = y - tile->slice_start_y;
Review

Should the same mapping be done for the init_from_bake?

int slice_count = tile_y / tile->slice_height;
tile_y = tile_y - slice_count * tile->slice_height;
tile_y = tile->slice_stride * slice_count + tile_y + tile->slice_start_y;
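A worked example of the mapping above with hypothetical numbers, slice_start_y = 100, slice_height = 2, slice_stride = 8, and a buffer row y = 105:

/* tile_y      = 105 - 100       = 5   (row inside this device's compact buffer)   */
/* slice_count = 5 / 2           = 2   (the row lives in the device's third slice)  */
/* tile_y      = 5 - 2 * 2       = 1   (row inside that slice)                      */
/* tile_y      = 8 * 2 + 1 + 100 = 117 (the row's true y in the full image)         */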
/* Initialize random number seed for path. */
const uint rng_hash = path_rng_hash_init(kg, sample, x, y);
const uint rng_hash = path_rng_hash_init(kg, sample, x, tile_y);
{
/* Generate camera ray. */
Ray ray;
integrate_camera_sample(kg, sample, x, y, rng_hash, &ray);
integrate_camera_sample(kg, sample, x, tile_y, rng_hash, &ray);
if (ray.tmax == 0.0f) {
return true;
}

View File

@@ -1558,6 +1558,12 @@ typedef struct KernelWorkTile {
/* Precalculated parameters used by init_from_camera kernel on GPU. */
int path_index_offset;
int work_size;
/* slice details */
uint slice_start_y;
uint slice_height;
uint slice_stride;
} KernelWorkTile;
/* Shader Evaluation.

View File

@@ -95,6 +95,9 @@ NODE_DEFINE(BufferParams)
SOCKET_INT(full_width, "Full Width", 0);
SOCKET_INT(full_height, "Full Height", 0);
SOCKET_INT(slice_stride, "Slice Stride", 0);
SOCKET_INT(slice_height, "Slice height", 0);
SOCKET_STRING(layer, "Layer", ustring());
SOCKET_STRING(view, "View", ustring());
SOCKET_INT(samples, "Samples", 0);
@@ -305,6 +308,8 @@ void RenderBuffers::copy_to_device()
void render_buffers_host_copy_denoised(RenderBuffers *dst,
const BufferParams &dst_params,
const size_t dst_offset,
const size_t dst_height,
const RenderBuffers *src,
const BufferParams &src_params,
const size_t src_offset)
@@ -345,15 +350,15 @@ void render_buffers_host_copy_denoised(RenderBuffers *dst,
/* TODO(sergey): Make it more reusable, allowing implement copy of noisy passes. */
const int64_t dst_width = dst_params.width;
const int64_t dst_height = dst_params.height;
const int64_t dst_pass_stride = dst_params.pass_stride;
const int64_t dst_num_pixels = dst_width * dst_height;
const int64_t dst_offset_in_floats = dst_offset * dst_pass_stride;
const int64_t src_pass_stride = src_params.pass_stride;
const int64_t src_offset_in_floats = src_offset * src_pass_stride;
const float *src_pixel = src->buffer.data() + src_offset_in_floats;
float *dst_pixel = dst->buffer.data();
float *dst_pixel = dst->buffer.data() + dst_offset_in_floats;
for (int i = 0; i < dst_num_pixels;
++i, src_pixel += src_pass_stride, dst_pixel += dst_pass_stride)

View File

@@ -93,6 +93,10 @@ class BufferParams : public Node {
/* Runtime fields, only valid after `update_passes()`. */
int pass_stride = -1;
/* Slice details */
int slice_stride;
int slice_height;
/* Properties which are used for accessing buffer pixels outside of scene graph. */
vector<BufferPass> passes;
ustring layer;
@@ -180,6 +184,8 @@ class RenderBuffers {
* Copy happens of the number of pixels in the destination. */
void render_buffers_host_copy_denoised(RenderBuffers *dst,
const BufferParams &dst_params,
const size_t dst_offset,
const size_t dst_height,
const RenderBuffers *src,
const BufferParams &src_params,
const size_t src_offset = 0);

View File

@@ -55,6 +55,7 @@ Session::Session(const SessionParams &params_, const SceneParams &scene_params)
device, scene->film, &scene->dscene, render_scheduler_, tile_manager_);
path_trace_->set_progress(&progress);
path_trace_->progress_update_cb = [&]() { update_status_time(); };
path_trace_->set_interleaved_slices(params.interleaved_slices);
tile_manager_.full_buffer_written_cb = [&](string_view filename) {
if (!full_buffer_written_cb) {

View File

@@ -58,6 +58,7 @@ class SessionParams {
bool use_resolution_divider;
ShadingSystem shadingsystem;
int interleaved_slices = true;
/* Session-specific temporary directory to store in-progress EXR files in. */
string temp_dir;
@@ -82,6 +83,7 @@ class SessionParams {
use_resolution_divider = true;
shadingsystem = SHADINGSYSTEM_SVM;
interleaved_slices = false;
}
bool modified(const SessionParams &params) const