With the current code in master, non-hardware-accelerated ray tracing devices see a measurable performance decrease when scrambling distance is enabled compared to when it is off. From testing, this performance decrease comes from the large tile sizes scheduled in `tile.cpp`. This patch addresses the decrease by using different algorithms to calculate the tile size for devices with hardware-accelerated ray traversal and for devices without it: large tile sizes for hardware-accelerated devices, small tile sizes for the others. Most of this code is based on proposals from @brecht and @leesonw.

Reviewed By: brecht, leesonw

Differential Revision: https://developer.blender.org/D13042
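As a rough sketch of the scheduling split described above (placeholder names and numbers only; `choose_effective_tile_size`, `device_has_hardware_rt` and the concrete sizes are illustrative assumptions, not the actual `tile.cpp` implementation):

#include <algorithm>

/* Illustrative sketch only -- not the real tile.cpp code. */
static int choose_effective_tile_size(bool device_has_hardware_rt,
                                      int image_width,
                                      int image_height)
{
  /* Large tiles keep hardware ray traversal busy, while devices without it
   * render small tiles faster when scrambling distance is enabled.
   * The 2048/128 values are placeholders, not the values used by Cycles. */
  const int preferred_size = device_has_hardware_rt ? 2048 : 128;

  /* Never schedule a tile larger than the image itself. */
  return std::min(preferred_size, std::max(image_width, image_height));
}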
/*
 * Copyright 2011-2015 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

CCL_NAMESPACE_BEGIN

/*
 * Utility functions for work stealing
 */

/* Map global work index to tile, pixel X/Y and sample. */
ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                      uint global_work_index,
                                      ccl_private uint *x,
                                      ccl_private uint *y,
                                      ccl_private uint *sample)
{
  uint sample_offset, pixel_offset;

  if (kernel_data.integrator.scrambling_distance < 0.9f) {
    /* Keep threads for the same sample together. */
    uint tile_pixels = tile->w * tile->h;
    sample_offset = global_work_index / tile_pixels;
    pixel_offset = global_work_index - sample_offset * tile_pixels;
  }
  else {
    /* Keeping threads for the same pixel together.
     * Appears to improve performance by a few % on CUDA and OptiX. */
    sample_offset = global_work_index % tile->num_samples;
    pixel_offset = global_work_index / tile->num_samples;
  }

  uint y_offset = pixel_offset / tile->w;
  uint x_offset = pixel_offset - y_offset * tile->w;

  *x = tile->x + x_offset;
  *y = tile->y + y_offset;
  *sample = tile->start_sample + sample_offset;
}

CCL_NAMESPACE_END
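For reference, the index arithmetic in get_work_pixel() can be exercised in isolation. The standalone program below is a sketch that reproduces the same decomposition with plain types: the WorkTile struct and the boolean flag are stand-ins for KernelWorkTile and the `scrambling_distance < 0.9f` check, not the Cycles types themselves. It prints how consecutive work indices map to pixels and samples in the scrambling-distance branch (all pixels of one sample before the next sample).

/* Standalone illustration, not part of the kernel: same arithmetic as
 * get_work_pixel(), with placeholder types. */
#include <cstdio>

struct WorkTile {
  unsigned x, y, w, h;
  unsigned start_sample, num_samples;
};

static void work_pixel(const WorkTile &tile, unsigned global_work_index, bool scrambling,
                       unsigned *x, unsigned *y, unsigned *sample)
{
  unsigned sample_offset, pixel_offset;
  if (scrambling) {
    /* Consecutive indices cover the whole tile for one sample. */
    const unsigned tile_pixels = tile.w * tile.h;
    sample_offset = global_work_index / tile_pixels;
    pixel_offset = global_work_index - sample_offset * tile_pixels;
  }
  else {
    /* Consecutive indices cover all samples of one pixel. */
    sample_offset = global_work_index % tile.num_samples;
    pixel_offset = global_work_index / tile.num_samples;
  }
  const unsigned y_offset = pixel_offset / tile.w;
  const unsigned x_offset = pixel_offset - y_offset * tile.w;
  *x = tile.x + x_offset;
  *y = tile.y + y_offset;
  *sample = tile.start_sample + sample_offset;
}

int main()
{
  const WorkTile tile = {0, 0, 2, 2, 0, 2}; /* 2x2 pixel tile, 2 samples. */
  for (unsigned i = 0; i < tile.w * tile.h * tile.num_samples; i++) {
    unsigned x, y, sample;
    work_pixel(tile, i, /*scrambling=*/true, &x, &y, &sample);
    printf("index %u -> pixel (%u, %u), sample %u\n", i, x, y, sample);
  }
  return 0;
}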