This repository has been archived on 2023-10-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
blender-archive/intern/cycles/kernel/device/gpu/work_stealing.h
Alaska b41c72b710 Fix performance decrease with Scrambling Distance on
With the current code in master, scrambling distance is enabled on non-hardware accelerated ray tracing devices see a measurable performance decrease when compared scrambling distance on vs off. From testing, this performance decrease comes from the large tile sizes scheduled in `tile.cpp`.

This patch attempts to address the performance decrease by using different algorithms to calculate the tile size for devices with hardware accelerated ray traversal and devices without. Large tile sizes for hardware accelerated devices and small tile sizes for others.

Most of this code is based on proposals from @brecht and @leesonw

Reviewed By: brecht, leesonw

Differential Revision: https://developer.blender.org/D13042
2021-11-25 09:32:26 +01:00

56 lines
1.8 KiB
C++

/*
* Copyright 2011-2015 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
CCL_NAMESPACE_BEGIN
/*
* Utility functions for work stealing
*/
/* Map global work index to tile, pixel X/Y and sample. */
ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
uint global_work_index,
ccl_private uint *x,
ccl_private uint *y,
ccl_private uint *sample)
{
uint sample_offset, pixel_offset;
if (kernel_data.integrator.scrambling_distance < 0.9f) {
/* Keep threads for the same sample together. */
uint tile_pixels = tile->w * tile->h;
sample_offset = global_work_index / tile_pixels;
pixel_offset = global_work_index - sample_offset * tile_pixels;
}
else {
/* Keeping threads for the same pixel together.
* Appears to improve performance by a few % on CUDA and OptiX. */
sample_offset = global_work_index % tile->num_samples;
pixel_offset = global_work_index / tile->num_samples;
}
uint y_offset = pixel_offset / tile->w;
uint x_offset = pixel_offset - y_offset * tile->w;
*x = tile->x + x_offset;
*y = tile->y + y_offset;
*sample = tile->start_sample + sample_offset;
}
CCL_NAMESPACE_END