diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 85da7024a2c..13fee6c02e4 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -51,7 +51,6 @@ DeviceSplitKernel::~DeviceSplitKernel()
 	delete kernel_direct_lighting;
 	delete kernel_shadow_blocked;
 	delete kernel_next_iteration_setup;
-	delete kernel_sum_all_radiance;
 }
 
 bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_features)
@@ -72,7 +71,6 @@ bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures& requested_fe
 	LOAD_KERNEL(direct_lighting);
 	LOAD_KERNEL(shadow_blocked);
 	LOAD_KERNEL(next_iteration_setup);
-	LOAD_KERNEL(sum_all_radiance);
 
 #undef LOAD_KERNEL
 
@@ -258,15 +256,6 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 			avg_time_per_sample = alpha*time_per_sample + (1.0-alpha)*avg_time_per_sample;
 		}
 
-		size_t sum_all_radiance_local_size[2] = {16, 16};
-		size_t sum_all_radiance_global_size[2];
-		sum_all_radiance_global_size[0] = round_up(tile.w, sum_all_radiance_local_size[0]);
-		sum_all_radiance_global_size[1] = round_up(tile.h, sum_all_radiance_local_size[1]);
-
-		ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
-		                     sum_all_radiance_global_size,
-		                     sum_all_radiance_local_size);
-
 #undef ENQUEUE_SPLIT_KERNEL
 
 		tile.sample += subtile.num_samples;
diff --git a/intern/cycles/device/device_split_kernel.h b/intern/cycles/device/device_split_kernel.h
index 1903574f0b5..1c6a2709cf2 100644
--- a/intern/cycles/device/device_split_kernel.h
+++ b/intern/cycles/device/device_split_kernel.h
@@ -65,7 +65,6 @@ private:
 	SplitKernelFunction *kernel_direct_lighting;
 	SplitKernelFunction *kernel_shadow_blocked;
 	SplitKernelFunction *kernel_next_iteration_setup;
-	SplitKernelFunction *kernel_sum_all_radiance;
 
 	/* Global memory variables [porting]; These memory is used for
 	 * co-operation between different kernels; Data written by one
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index d467e40b3e9..df40c3a0e8e 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -26,7 +26,6 @@ set(SRC
 	kernels/opencl/kernel_direct_lighting.cl
 	kernels/opencl/kernel_shadow_blocked.cl
 	kernels/opencl/kernel_next_iteration_setup.cl
-	kernels/opencl/kernel_sum_all_radiance.cl
 	kernels/cuda/kernel.cu
 	kernels/cuda/kernel_split.cu
 )
@@ -209,7 +208,6 @@ set(SRC_SPLIT_HEADERS
 	split/kernel_shadow_blocked.h
 	split/kernel_split_common.h
 	split/kernel_split_data.h
-	split/kernel_sum_all_radiance.h
 )
 
 # CUDA module
@@ -412,7 +410,6 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_holdout_emiss
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
-delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel/kernels/opencl)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/kernel/kernels/cuda)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 8c1675665cb..deb872444d0 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -81,7 +81,6 @@ DECLARE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DECLARE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DECLARE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
 DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DECLARE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
 
 void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
 
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index f6e0591ef24..d6d0db4e034 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -48,7 +48,6 @@
 #  include "split/kernel_direct_lighting.h"
 #  include "split/kernel_shadow_blocked.h"
 #  include "split/kernel_next_iteration_setup.h"
-#  include "split/kernel_sum_all_radiance.h"
 #endif
 
 CCL_NAMESPACE_BEGIN
@@ -174,7 +173,6 @@ DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
 DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
 
 void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
 {
@@ -198,7 +196,6 @@ void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name,
 	REGISTER(direct_lighting);
 	REGISTER(shadow_blocked);
 	REGISTER(next_iteration_setup);
-	REGISTER(sum_all_radiance);
 
 #undef REGISTER
 #undef REGISTER_EVAL_NAME
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_split.cu b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
index 3a883265157..53a36b15e40 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_split.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel_split.cu
@@ -35,7 +35,6 @@
 #include "../../split/kernel_direct_lighting.h"
 #include "../../split/kernel_shadow_blocked.h"
 #include "../../split/kernel_next_iteration_setup.h"
-#include "../../split/kernel_sum_all_radiance.h"
 
 #include "../../kernel_film.h"
 
@@ -92,7 +91,6 @@ DEFINE_SPLIT_KERNEL_FUNCTION(holdout_emission_blurring_pathtermination_ao)
 DEFINE_SPLIT_KERNEL_FUNCTION(direct_lighting)
 DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked)
 DEFINE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
-DEFINE_SPLIT_KERNEL_FUNCTION(sum_all_radiance)
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
diff --git a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl b/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
deleted file mode 100644
index e945050a110..00000000000
--- a/intern/cycles/kernel/kernels/opencl/kernel_sum_all_radiance.cl
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel_compat_opencl.h"
-#include "split/kernel_split_common.h"
-#include "split/kernel_sum_all_radiance.h"
-
-__kernel void kernel_ocl_path_trace_sum_all_radiance(
-        KernelGlobals *kg,
-        ccl_constant KernelData *data)
-{
-	kernel_sum_all_radiance(kg);
-}
diff --git a/intern/cycles/kernel/split/kernel_background_buffer_update.h b/intern/cycles/kernel/split/kernel_background_buffer_update.h
index 07e5522c830..04aaf1bbaad 100644
--- a/intern/cycles/kernel/split/kernel_background_buffer_update.h
+++ b/intern/cycles/kernel/split/kernel_background_buffer_update.h
@@ -119,7 +119,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 	ccl_global float3 *throughput = &kernel_split_state.throughput[ray_index];
 	ccl_global float *L_transparent = &kernel_split_state.L_transparent[ray_index];
 	ccl_global uint *rng = &kernel_split_state.rng[ray_index];
-	ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
+	ccl_global float *buffer = kernel_split_params.buffer;
 
 	unsigned int work_index;
 	ccl_global uint *initial_rng;
@@ -129,7 +129,6 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 	unsigned int tile_y;
 	unsigned int pixel_x;
 	unsigned int pixel_y;
-	unsigned int my_sample_tile;
 
 	work_index = kernel_split_state.work_array[ray_index];
 	sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
@@ -137,11 +136,10 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 	                        &tile_x, &tile_y,
 	                        work_index,
 	                        ray_index);
-	my_sample_tile = 0;
 	initial_rng = rng_state;
 
-	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
-	per_sample_output_buffers += ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride;
+	rng_state += kernel_split_params.offset + pixel_x + pixel_y*stride;
+	buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
 	if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
 		/* eval background shader if nothing hit */
@@ -165,14 +163,14 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 
 	if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
 		float3 L_sum = path_radiance_clamp_and_sum(kg, L);
-		kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+		kernel_write_light_passes(kg, buffer, L, sample);
 #ifdef __KERNEL_DEBUG__
-		kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+		kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
 #endif
 		float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
 
 		/* accumulate result in output buffer */
-		kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+		kernel_write_pass_float4(buffer, sample, L_rad);
 		path_rng_end(kg, rng_state, *rng);
 
 		ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
@@ -192,13 +190,11 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 			sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
 			/* Get pixel and tile position associated with current work */
 			get_work_pixel_tile_position(kg, &pixel_x, &pixel_y, &tile_x, &tile_y, work_index, ray_index);
-			my_sample_tile = 0;
 
 			/* Remap rng_state according to the current work */
-			rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
-			/* Remap per_sample_output_buffers according to the current work */
-			per_sample_output_buffers = kernel_split_state.per_sample_output_buffers
-				+ ((tile_x + (tile_y * stride)) + my_sample_tile) * kernel_data.film.pass_stride;
+			rng_state = initial_rng + kernel_split_params.offset + pixel_x + pixel_y*stride;
+			/* Remap buffer according to the current work */
+			buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
 
 			/* Initialize random numbers and ray. */
 			kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
@@ -221,7 +217,7 @@ ccl_device void kernel_background_buffer_update(KernelGlobals *kg)
 				/* These rays do not participate in path-iteration. */
 				float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 				/* Accumulate result in output buffer. */
-				kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+				kernel_write_pass_float4(buffer, sample, L_rad);
 				path_rng_end(kg, rng_state, *rng);
 
 				ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
diff --git a/intern/cycles/kernel/split/kernel_data_init.h b/intern/cycles/kernel/split/kernel_data_init.h
index 982c7be2008..c22703e5abd 100644
--- a/intern/cycles/kernel/split/kernel_data_init.h
+++ b/intern/cycles/kernel/split/kernel_data_init.h
@@ -24,6 +24,21 @@ CCL_NAMESPACE_BEGIN
  * The number of elements in the queues is initialized to 0;
  */
 
+/* distributes an amount of work across all threads
+ * note: work done inside the loop may not show up to all threads till after the current kernel has completed
+ */
+#define parallel_for(kg, iter_name, work_size) \
+	for(size_t _size = (work_size), \
+	    _global_size = ccl_global_size(0) * ccl_global_size(1), \
+	    _n = _size / _global_size, \
+		_thread = ccl_global_id(0) + ccl_global_id(1) * ccl_global_size(0), \
+	    iter_name = (_n > 0) ? (_thread * _n) : (_thread) \
+		; \
+		(iter_name < (_thread+1) * _n) || (iter_name == _n * _global_size + _thread && _thread < _size % _global_size) \
+		; \
+		iter_name = (iter_name != (_thread+1) * _n - 1) ? (iter_name + 1) : (_n * _global_size + _thread) \
+	)
+
 #ifndef __KERNEL_CPU__
 ccl_device void kernel_data_init(
 #else
@@ -110,6 +125,21 @@ void KERNEL_FUNCTION_FULL_NAME(data_init)(
 		 */
 		*use_queues_flag = 0;
 	}
+
+	/* zero the tiles pixels if this is the first sample */
+	if(start_sample == 0) {
+		parallel_for(kg, i, sw * sh * kernel_data.film.pass_stride) {
+			int pixel = i / kernel_data.film.pass_stride;
+			int pass = i % kernel_data.film.pass_stride;
+
+			int x = sx + pixel % sw;
+			int y = sy + pixel / sw;
+
+			int index = (offset + x + y*stride) * kernel_data.film.pass_stride + pass;
+
+			*(buffer + index) = 0.0f;
+		}
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
index ee9c4280b22..7168efa59ae 100644
--- a/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
+++ b/intern/cycles/kernel/split/kernel_holdout_emission_blurring_pathtermination_ao.h
@@ -114,7 +114,6 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 
 	unsigned int tile_x;
 	unsigned int tile_y;
-	int my_sample_tile;
 	unsigned int sample;
 
 	ccl_global RNG *rng = 0x0;
@@ -123,7 +122,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 
 	ccl_global char *ray_state = kernel_split_state.ray_state;
 	ShaderData *sd = &kernel_split_state.sd[ray_index];
-	ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
+	ccl_global float *buffer = kernel_split_params.buffer;
 
 	if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
 
@@ -137,11 +136,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 		                        &tile_x, &tile_y,
 		                        work_index,
 		                        ray_index);
-		my_sample_tile = 0;
 
-		per_sample_output_buffers +=
-		    ((tile_x + (tile_y * stride)) + my_sample_tile) *
-		    kernel_data.film.pass_stride;
+		buffer += (kernel_split_params.offset + pixel_x + pixel_y * stride) * kernel_data.film.pass_stride;
 
 		/* holdout */
 #ifdef __HOLDOUT__
@@ -172,7 +168,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(KernelGlobal
 		PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
 		/* Holdout mask objects do not write data passes. */
 		kernel_write_data_passes(kg,
-		                         per_sample_output_buffers,
+		                         buffer,
 		                         L,
 		                         sd,
 		                         sample,
diff --git a/intern/cycles/kernel/split/kernel_path_init.h b/intern/cycles/kernel/split/kernel_path_init.h
index e613db214ed..d2e2ffaca91 100644
--- a/intern/cycles/kernel/split/kernel_path_init.h
+++ b/intern/cycles/kernel/split/kernel_path_init.h
@@ -35,7 +35,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	unsigned int pixel_y;
 	unsigned int tile_x;
 	unsigned int tile_y;
-	unsigned int my_sample_tile;
 
 	unsigned int work_index = 0;
 	/* Get work. */
@@ -49,8 +48,6 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	/* Get the sample associated with the work. */
 	my_sample = get_work_sample(kg, work_index, ray_index) + kernel_split_params.start_sample;
 
-	my_sample_tile = 0;
-
 	/* Get pixel and tile position associated with the work. */
 	get_work_pixel_tile_position(kg, &pixel_x, &pixel_y,
 	                        &tile_x, &tile_y,
@@ -61,9 +58,8 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 	ccl_global uint *rng_state = kernel_split_params.rng_state;
 	rng_state += kernel_split_params.offset + pixel_x + pixel_y*kernel_split_params.stride;
 
-	ccl_global float *per_sample_output_buffers = kernel_split_state.per_sample_output_buffers;
-	per_sample_output_buffers += (tile_x + tile_y * kernel_split_params.stride + my_sample_tile)
-	                             * kernel_data.film.pass_stride;
+	ccl_global float *buffer = kernel_split_params.buffer;
+	buffer += (kernel_split_params.offset + pixel_x + pixel_y * kernel_split_params.stride) * kernel_data.film.pass_stride;
 
 	/* Initialize random numbers and ray. */
 	kernel_path_trace_setup(kg,
@@ -94,7 +90,7 @@ ccl_device void kernel_path_init(KernelGlobals *kg) {
 		/* These rays do not participate in path-iteration. */
 		float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
 		/* Accumulate result in output buffer. */
-		kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+		kernel_write_pass_float4(buffer, my_sample, L_rad);
 		path_rng_end(kg, rng_state, kernel_split_state.rng[ray_index]);
 		ASSIGN_RAY_STATE(kernel_split_state.ray_state, ray_index, RAY_TO_REGENERATE);
 	}
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index 7e88b6f5168..5dd53f42478 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -89,7 +89,6 @@ typedef struct SplitData {
 	/* size calculation for these is non trivial, so they are left out of SPLIT_DATA_ENTRIES and handled separately */
 	ShaderData *sd;
 	ShaderData *sd_DL_shadow;
-	ccl_global float *per_sample_output_buffers;
 
 	/* this is actually in a separate buffer from the rest of the split state data (so it can be read back from
 	 * the host easily) but is still used the same as the other data so we have it here in this struct as well
@@ -113,7 +112,6 @@ ccl_device_inline size_t split_data_buffer_size(size_t num_elements,
 	 */
 	size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */
 	size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */
-	size += align_up(num_elements * per_thread_output_buffer_size, 16); /* per_sample_output_buffers */
 
 	return size;
 }
@@ -136,9 +134,6 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
 	split_data->sd_DL_shadow = (ShaderData*)p;
 	p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
 
-	split_data->per_sample_output_buffers = (ccl_global float*)p;
-	//p += align_up(num_elements * per_thread_output_buffer_size, 16);
-
 	split_data->ray_state = ray_state;
 }
 
diff --git a/intern/cycles/kernel/split/kernel_sum_all_radiance.h b/intern/cycles/kernel/split/kernel_sum_all_radiance.h
deleted file mode 100644
index fdceae2dafb..00000000000
--- a/intern/cycles/kernel/split/kernel_sum_all_radiance.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright 2011-2015 Blender Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-CCL_NAMESPACE_BEGIN
-
-/* Since we process various samples in parallel; The output radiance of different samples
- * are stored in different locations; This kernel combines the output radiance contributed
- * by all different samples and stores them in the RenderTile's output buffer.
- */
-
-ccl_device void kernel_sum_all_radiance(KernelGlobals *kg)
-{
-	int x = ccl_global_id(0);
-	int y = ccl_global_id(1);
-
-	ccl_global float *buffer = kernel_split_params.buffer;
-	int sw = kernel_split_params.w;
-	int sh = kernel_split_params.h;
-	int stride = kernel_split_params.stride;
-	int start_sample = kernel_split_params.start_sample;
-
-	if(x < sw && y < sh) {
-		ccl_global float *per_sample_output_buffer = kernel_split_state.per_sample_output_buffers;
-		per_sample_output_buffer += (x + y * stride) * (kernel_data.film.pass_stride);
-
-		x += kernel_split_params.x;
-		y += kernel_split_params.y;
-
-		buffer += (kernel_split_params.offset + x + y*stride) * (kernel_data.film.pass_stride);
-
-		int pass_stride_iterator = 0;
-		int num_floats = kernel_data.film.pass_stride;
-
-		for(pass_stride_iterator = 0; pass_stride_iterator < num_floats; pass_stride_iterator++) {
-			*(buffer + pass_stride_iterator) =
-			        (start_sample == 0)
-			                ? *(per_sample_output_buffer + pass_stride_iterator)
-			                : *(buffer + pass_stride_iterator) + *(per_sample_output_buffer + pass_stride_iterator);
-		}
-	}
-}
-
-CCL_NAMESPACE_END
-