diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp index 85da7024a2c..13fee6c02e4 100644 --- a/intern/cycles/device/device_split_kernel.cpp +++ b/intern/cycles/device/device_split_kernel.cpp @@ -51,7 +51,6 @@ DeviceSplitKernel::~DeviceSplitKernel() delete kernel_direct_lighting; delete kernel_shadow_blocked; delete kernel_next_iteration_setup; - delete kernel_sum_all_radiance; } bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features) @@ -72,7 +71,6 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe LOAD_KERNEL(direct_lighting); LOAD_KERNEL(shadow_blocked); LOAD_KERNEL(next_iteration_setup); - LOAD_KERNEL(sum_all_radiance); #undef LOAD_KERNEL @@ -258,15 +256,6 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task, avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample; } - size_t sum_all_radiance_local_size[2] = {16, 16}; - size_t sum_all_radiance_global_size[2]; - sum_all_radiance_global_size[0] = round_up(tile.w, sum_all_radiance_local_size[0]); - sum_all_radiance_global_size[1] = round_up(tile.h, sum_all_radiance_local_size[1]); - - ENQUEUE_SPLIT_KERNEL(sum_all_radiance, - sum_all_radiance_global_size, - sum_all_radiance_local_size); - #undef ENQUEUE_SPLIT_KERNEL tile.sample += subtile.num_samples; diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h index 1903574f0b5..1c6a2709cf2 100644 --- a/intern/cycles/device/device_split_kernel.h +++ b/intern/cycles/device/device_split_kernel.h @@ -65,7 +65,6 @@ private: SplitKernelFunction *kernel_direct_lighting; SplitKernelFunction *kernel_shadow_blocked; SplitKernelFunction *kernel_next_iteration_setup; - SplitKernelFunction *kernel_sum_all_radiance; /* Global memory variables [porting]; These memory is used for * co-operation between different kernels; Data written by one diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt index d467e40b3e9..df40c3a0e8e 100644 --- a/intern/cycles/kernel/CMakeLists.txt +++ b/intern/cycles/kernel/CMakeLists.txt @@ -26,7 +26,6 @@ set(SRC kernels/opencl/kernel_direct_lighting.cl kernels/opencl/kernel_shadow_blocked.cl kernels/opencl/kernel_next_iteration_setup.cl - kernels/opencl/kernel_sum_all_radiance.cl kernels/cuda/kernel.cu kernels/cuda/kernel_split.cu ) @@ -209,7 +208,6 @@ set(SRC_SPLIT_HEADERS split/kernel_shadow_blocked.h split/kernel_split_common.h split/kernel_split_data.h - split/kernel_sum_all_radiance.h ) # CUDA module @@ -412,7 +410,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emiss delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) -delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda) delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel) diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h index 8c1675665cb..deb872444d0 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h @@ -81,7 +81,6 @@ DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting) DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked) DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) -DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)); diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h index f6e0591ef24..d6d0db4e034 100644 --- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h +++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h @@ -48,7 +48,6 @@ # include "split/kernel_direct_lighting.h" # include "split/kernel_shadow_blocked.h" # include "split/kernel_next_iteration_setup.h" -# include "split/kernel_sum_all_radiance.h" #endif CCL_NAMESPACE_BEGIN @@ -174,7 +173,6 @@ DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) -DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func)) { @@ -198,7 +196,6 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, REGISTER(direct_lighting); REGISTER(shadow_blocked); REGISTER(next_iteration_setup); - REGISTER(sum_all_radiance); #undef REGISTER #undef REGISTER_EVAL_NAME diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu index 3a883265157..53a36b15e40 100644 --- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu +++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu @@ -35,7 +35,6 @@ #include "../../split/kernel_direct_lighting.h" #include "../../split/kernel_shadow_blocked.h" #include "../../split/kernel_next_iteration_setup.h" -#include "../../split/kernel_sum_all_radiance.h" #include "../../kernel_film.h" @@ -92,7 +91,6 @@ DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao) DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting) DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked) DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup) -DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance) extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS) diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl deleted file mode 100644 index e945050a110..00000000000 --- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel_compat_opencl.h" -#include "split/kernel_split_common.h" -#include "split/kernel_sum_all_radiance.h" - -__kernel void kernel_ocl_path_trace_sum_all_radiance( - KernelGlobals *kg, - ccl_constant KernelData *data) -{ - kernel_sum_all_radiance(kg); -} diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h index 07e5522c830..04aaf1bbaad 100644 --- a/intern/cycles/kernel/split/kernel_background_buffer_update.h +++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h @@ -119,7 +119,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg) ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index]; ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index]; ccl_global uint *rng = &kernel_split_state.rng[ray_index]; - ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers; + ccl_global float *buffer = kernel_split_params.buffer; unsigned int work_index; ccl_global uint *initial_rng; @@ -129,7 +129,6 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg) unsigned int tile_y; unsigned int pixel_x; unsigned int pixel_y; - unsigned int my_sample_tile; work_index = kernel_split_state.work_array[ray_index]; sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; @@ -137,11 +136,10 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg) &tile_x, &tile_y, work_index, ray_index); - my_sample_tile = 0; initial_rng = rng_state; - rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; - per_sample_output_buffers += ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride; + rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride; + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) { /* eval background shader if nothing hit */ @@ -165,14 +163,14 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg) if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) { float3 L_sum = path_radiance_clamp_and_sum(kg, L); - kernel_write_light_passes(kg, per_sample_output_buffers, L, sample); + kernel_write_light_passes(kg, buffer, L, sample); #ifdef __KERNEL_DEBUG__ - kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample); + kernel_write_debug_passes(kg, buffer, state, debug_data, sample); #endif float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent)); /* accumulate result in output buffer */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + kernel_write_pass_float4(buffer, sample, L_rad); path_rng_end(kg, rng_state, *rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); @@ -192,13 +190,11 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg) sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; /* Get pixel and tile position associated with current work */ get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index); - my_sample_tile = 0; /* Remap rng_state according to the current work */ - rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; - /* Remap per_sample_output_buffers according to the current work */ - per_sample_output_buffers = kernel_split_state.per_sample_output_buffers - + ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride; + rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride; + /* Remap buffer according to the current work */ + buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride; /* Initialize random numbers and ray. */ kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray); @@ -221,7 +217,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg) /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad); + kernel_write_pass_float4(buffer, sample, L_rad); path_rng_end(kg, rng_state, *rng); ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE); diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h index 982c7be2008..c22703e5abd 100644 --- a/intern/cycles/kernel/split/kernel_data_init.h +++ b/intern/cycles/kernel/split/kernel_data_init.h @@ -24,6 +24,21 @@ CCL_NAMESPACE_BEGIN * The number of elements in the queues is initialized to 0; */ +/* distributes an amount of work across all threads + * note: work done inside the loop may not show up to all threads till after the current kernel has completed + */ +#define parallel_for(kg, iter_name, work_size) \ + for(size_t _size = (work_size), \ + _global_size = ccl_global_size(0) * ccl_global_size(1), \ + _n = _size / _global_size, \ + _thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \ + iter_name = (_n > 0) ? (_thread * _n) : (_thread) \ + ; \ + (iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \ + ; \ + iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \ + ) + #ifndef __KERNEL_CPU__ ccl_device void kernel_data_init( #else @@ -110,6 +125,21 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)( */ *use_queues_flag = 0; } + + /* zero the tiles pixels if this is the first sample */ + if(start_sample == 0) { + parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) { + int pixel = i / kernel_data.film.pass_stride; + int pass = i % kernel_data.film.pass_stride; + + int x = sx + pixel % sw; + int y = sy + pixel / sw; + + int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass; + + *(buffer + index) = 0.0f; + } + } } CCL_NAMESPACE_END diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h index ee9c4280b22..7168efa59ae 100644 --- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h +++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h @@ -114,7 +114,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal unsigned int tile_x; unsigned int tile_y; - int my_sample_tile; unsigned int sample; ccl_global RNG *rng = 0x0; @@ -123,7 +122,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal ccl_global char *ray_state = kernel_split_state.ray_state; ShaderData *sd = &kernel_split_state.sd[ray_index]; - ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers; + ccl_global float *buffer = kernel_split_params.buffer; if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) { @@ -137,11 +136,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal &tile_x, &tile_y, work_index, ray_index); - my_sample_tile = 0; - per_sample_output_buffers += - ((tile_x + (tile_y * stride)) + my_sample_tile) * - kernel_data.film.pass_stride; + buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride; /* holdout */ #ifdef __HOLDOUT__ @@ -172,7 +168,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal PathRadiance *L = &kernel_split_state.path_radiance[ray_index]; /* Holdout mask objects do not write data passes. */ kernel_write_data_passes(kg, - per_sample_output_buffers, + buffer, L, sd, sample, diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h index e613db214ed..d2e2ffaca91 100644 --- a/intern/cycles/kernel/split/kernel_path_init.h +++ b/intern/cycles/kernel/split/kernel_path_init.h @@ -35,7 +35,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { unsigned int pixel_y; unsigned int tile_x; unsigned int tile_y; - unsigned int my_sample_tile; unsigned int work_index = 0; /* Get work. */ @@ -49,8 +48,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { /* Get the sample associated with the work. */ my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample; - my_sample_tile = 0; - /* Get pixel and tile position associated with the work. */ get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, @@ -61,9 +58,8 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { ccl_global uint *rng_state = kernel_split_params.rng_state; rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride; - ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers; - per_sample_output_buffers += (tile_x + tile_y * kernel_split_params.stride + my_sample_tile) - * kernel_data.film.pass_stride; + ccl_global float *buffer = kernel_split_params.buffer; + buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride; /* Initialize random numbers and ray. */ kernel_path_trace_setup(kg, @@ -94,7 +90,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) { /* These rays do not participate in path-iteration. */ float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f); /* Accumulate result in output buffer. */ - kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad); + kernel_write_pass_float4(buffer, my_sample, L_rad); path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]); ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE); } diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h index 7e88b6f5168..5dd53f42478 100644 --- a/intern/cycles/kernel/split/kernel_split_data.h +++ b/intern/cycles/kernel/split/kernel_split_data.h @@ -89,7 +89,6 @@ typedef struct SplitData { /* size calculation for these is non trivial, so they are left out of SPLIT_DATA_ENTRIES and handled separately */ ShaderData *sd; ShaderData *sd_DL_shadow; - ccl_global float *per_sample_output_buffers; /* this is actually in a separate buffer from the rest of the split state data (so it can be read back from * the host easily) but is still used the same as the other data so we have it here in this struct as well @@ -113,7 +112,6 @@ ccl_device_inline size_t split_data_buffer_size(size_t num_elements, */ size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */ size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */ - size += align_up(num_elements * per_thread_output_buffer_size, 16); /* per_sample_output_buffers */ return size; } @@ -136,9 +134,6 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data, split_data->sd_DL_shadow = (ShaderData*)p; p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16); - split_data->per_sample_output_buffers = (ccl_global float*)p; - //p += align_up(num_elements * per_thread_output_buffer_size, 16); - split_data->ray_state = ray_state; } diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h deleted file mode 100644 index fdceae2dafb..00000000000 --- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2011-2015 Blender Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -CCL_NAMESPACE_BEGIN - -/* Since we process various samples in parallel; The output radiance of different samples - * are stored in different locations; This kernel combines the output radiance contributed - * by all different samples and stores them in the RenderTile's output buffer. - */ - -ccl_device void kernel_sum_all_radiance(KernelGlobals *kg) -{ - int x = ccl_global_id(0); - int y = ccl_global_id(1); - - ccl_global float *buffer = kernel_split_params.buffer; - int sw = kernel_split_params.w; - int sh = kernel_split_params.h; - int stride = kernel_split_params.stride; - int start_sample = kernel_split_params.start_sample; - - if(x < sw && y < sh) { - ccl_global float *per_sample_output_buffer = kernel_split_state.per_sample_output_buffers; - per_sample_output_buffer += (x + y * stride) * (kernel_data.film.pass_stride); - - x += kernel_split_params.x; - y += kernel_split_params.y; - - buffer += (kernel_split_params.offset + x + y*stride) * (kernel_data.film.pass_stride); - - int pass_stride_iterator = 0; - int num_floats = kernel_data.film.pass_stride; - - for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) { - *(buffer + pass_stride_iterator) = - (start_sample == 0) - ? *(per_sample_output_buffer + pass_stride_iterator) - : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator); - } - } -} - -CCL_NAMESPACE_END -