
Compare commits

...

322 Commits

Author SHA1 Message Date
21a8629ac6 Cycles Denoising: Fix reading of the new properties 2017-05-04 14:57:20 +02:00
713a0905a7 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-05-04 14:20:27 +02:00
2e922c9870 Cycles Denoising: Add option to store feature passes in the render result
Useful for debugging and possibly in the future in combination with a standalone animation denoising mode
2017-05-04 14:19:50 +02:00
954f201b3e Cycles Denoising: Move denoising properties to the Cycles addon and improve the UI 2017-05-04 14:19:50 +02:00
a89a260634 Cycles Denoising: Fix crash in CPUDevice
The code initialized allocated memory by assigning the KernelGlobals to it. However, that calls the assignment operator, which frees previous elements which were never allocated.
2017-05-03 21:41:14 +02:00
606107f6c4 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-05-03 21:35:58 +02:00
ea109fa4a6 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-05-02 13:43:05 +02:00
87674f432e Cycles Denoising: Fix shadowing pass calculation 2017-05-02 13:17:13 +02:00
4cd485f355 Cycles Denoising: Cleanup offset calculation 2017-05-02 13:16:42 +02:00
b538a5c085 Cycles Denoising: Only store lower-triangular parts of XtWX matrix
This saves 55 floats per pixel, which is quite significant with GPU tile sizes.
2017-05-02 13:16:21 +02:00
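For an n×n symmetric matrix, packed storage keeps n(n+1)/2 of the n² entries, saving n(n-1)/2 floats; a saving of 55 is consistent with n = 11. A minimal sketch of lower-triangular packed indexing (the helper name and the dimension are illustrative assumptions, not Cycles' actual code):

```python
def packed_index(row, col):
    # Index into a lower-triangular matrix packed row by row (requires col <= row).
    return row * (row + 1) // 2 + col

# Saving vs. full storage: n*n - n*(n+1)/2 == n*(n-1)/2; 55 matches n = 11.
n = 11
saved = n * n - n * (n + 1) // 2
```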
7c0204c6d5 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-04-24 22:17:45 +02:00
e1f5b6bdcb Cycles Denoising: Merge all the changes from the version that's being reviewed
No functional changes.
2017-04-24 22:15:09 +02:00
14f393f2ce Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-04-20 23:03:08 +02:00
d9f006631c Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-04-19 20:45:56 +02:00
374efaf997 Cycles Denoising: Disable local (aka shared) memory on OpenCL
On GPU architectures, storing the design row in local memory improves performance due to lower global memory bandwidth requirements.
However, if the GPU doesn't have enough local memory available, occupancy suffers which makes it even slower than the global memory version.

On CUDA, the amount of available local memory (shared memory in CUDA terminology) can be controlled, but that's not possible on OpenCL. So, to avoid a huge performance hit when the local memory isn't enough, it's disabled on OpenCL.
2017-04-19 20:35:54 +02:00
5bd8d66408 Cycles Denoising: Fix compilation on AMD CPU OpenCL 2017-04-19 20:35:54 +02:00
d59d03818e Cycles Denoising: Remove unused filter_compat headers 2017-04-19 20:35:54 +02:00
73e8114ad8 Cycles Denoising: Remove variable for split kernel variance
The devices know whether they're using the split kernel or not, there's no need to store the information.
2017-04-19 20:35:54 +02:00
f66554706b Cycles Denoising: Remove obsolete filter kernel and argument 2017-04-19 20:35:24 +02:00
983bc977fa Cycles Denoising: Remove obsolete kernel pass writing option
It was used for cross-buffer denoising, but that was removed a while ago.
2017-04-19 20:34:51 +02:00
338bf0bfc1 Cycles Denoising: Fix filter kernel build system 2017-04-19 20:34:51 +02:00
c48b1a4fe4 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-04-14 00:54:04 +02:00
0cde3b08c7 Cycles Denoising: Remove unused flag 2017-04-14 00:45:52 +02:00
31fdea4dc1 Cycles Denoising: Only write shadowing info if denoising or shadowcatcher is enabled 2017-04-14 00:45:01 +02:00
57f93bf448 Cycles Denoising: Avoid useless calculation in branched path tracing if denoising is disabled 2017-04-14 00:45:01 +02:00
f3e664907c Cycles Denoising: Move the denoising feature evaluation before the subsurface bounce
Previously, SSS bounces would skip the denoising feature evaluation, basically acting like a specular surface.
2017-04-14 00:21:42 +02:00
b0dc151dc3 Cycles Denoising: Move closure classification to a separate function 2017-04-14 00:13:44 +02:00
7979959527 Cycles Denoising: Move offset pointer handling to a separate class
This also fixes the memory leak on OpenCL.
2017-04-13 23:05:23 +02:00
2218e9927b Cycles Denoising: Move back into the kernel/ folder 2017-04-13 22:25:20 +02:00
4d0fc5de3a Cycles Denoising: CPUDevice tweaks from the review 2017-04-13 05:03:13 +02:00
2eea3d1d29 Cycles Denoising: Add back the feature strength option
Note: the control now behaves differently; reusing old values might give surprising results
2017-04-13 04:52:32 +02:00
199f8e9018 Cycles Denoising: Fix Multi-GPU viewport rendering 2017-04-11 19:54:47 +02:00
7462a684bf Cycles Denoising: Fix Multi-GPU OpenCL denoising 2017-04-11 19:54:43 +02:00
0226279d0e Cycles Denoising: Fix AMD OpenCL support
Credit for this patch goes to nirved.
2017-04-11 19:54:39 +02:00
1a3a77cdb9 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-04-11 19:54:33 +02:00
116dd643ab Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-03-31 00:32:13 +02:00
fbd0b1de2b Cycles Denoising: Remove a few unused utility functions 2017-03-30 00:44:36 +02:00
30f4576d51 Cycles Denoising: Refactor design row construction
This change improves the shared memory access pattern, reduces the local memory requirements by 11 floats per thread and saves some memory copying.
2017-03-30 00:25:45 +02:00
7a8629def8 Cycles Denoising: Allocate more shared memory for the CUDA reconstruction kernel
The small shared memory size limited execution to a single block, which caused slowdowns in CUDA denoising due to bad occupancy.
2017-03-30 00:21:22 +02:00
96ec1641c9 Cycles Denoising: Remove cross- and gradient-denoising option, replace PCA threshold with a checkbox
Cross denoising is significantly slower, needs more memory and didn't really produce better results in my tests.
Gradient denoising sometimes helped, but tends to produce artifacts and had been broken for a few weeks anyway.

The extremely confusing "Filter strength" (negative values used to map to an absolute threshold of 10^(2*-strength), positive ones to a relative threshold of 10^(2*strength), 0 to relative 1e-3) was replaced by a checkbox that selects between an absolute threshold of 1 and a relative threshold of 1e-3.
Eventually, I'd like to completely remove the option, but it's not clear yet which one is the better approach.
2017-03-29 07:52:47 +02:00
08fb4c627e Cycles Denoising: Fix deadlock in the neighbor tile acquisition 2017-03-28 04:49:21 +02:00
d8a86e7fd1 Cycles Denoising: Copy back the denoised tile buffer when using Multi-GPU rendering 2017-03-27 18:49:02 +02:00
d23da4133f Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-03-27 18:20:46 +02:00
e6ab6cbae4 Cycles Denoising: Change include path to make MSVC happy
Apparently there's an include conflict on Windows that causes it to use the wrong filter.h otherwise.
2017-03-27 02:13:08 +02:00
6e008e7bb9 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-03-27 01:44:44 +02:00
730635ce24 Cycles Denoising: Skip out-of-bounds tiles when mapping tile memory to the denoising device 2017-03-27 01:43:52 +02:00
2f92506ad8 Cycles Denoising: Hopefully fix Multi-GPU denoising
The problem there is that currently tiles get allocated on the GPU that's used to render them.
However, if a GPU is supposed to denoise a tile, it needs all 8 neighbors in its memory as well.

Therefore, the code now allocates and copies the tiles on the denoising GPU as well.
2017-03-26 21:15:56 +02:00
1ae08a9978 Cycles Denoising: Use correct alignment of denoising buffer offsets for OpenCL 2017-03-25 01:45:19 +01:00
852e2fc6ec Cycles Denoising: Fix variance calculation for cross denoising and split kernel 2017-03-25 01:27:48 +01:00
7858a232c1 Cycles Denoising: Fix building after recent cleanup commit 2017-03-24 20:37:06 +01:00
2c898bb079 Cycles Denoising: Remove two debugging CMake options 2017-03-24 20:14:13 +01:00
47280704aa Cycles Denoising: Fix compilation without pass support 2017-03-24 20:09:08 +01:00
4ecbe5e1c5 Cycles Denoising: Use isfinite_safe instead of own function 2017-03-24 19:58:06 +01:00
0dd708bc1b Merge branch 'origin/master' into temp-cycles-denoising 2017-03-24 19:51:04 +01:00
ded186c2bb Cycles Denoising: Remove unused preprocessor option
It was used for a quick test a long time ago and is no longer relevant.
2017-03-24 19:18:33 +01:00
2ecaebf965 Cycles Denoising: Make naming of the options more consistent
Used to be a mess of denoising_, denoise_ and filter_, now all start with denoising_.
2017-03-24 19:18:33 +01:00
f8bb5aac26 Cycles Denoising: Rename half window to radius 2017-03-24 16:46:16 +01:00
394f0391cc Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-03-24 16:45:44 +01:00
2037420743 Cycles Denoising: Implement Host-side OpenCL denoising support
Nothing fancy going on there, just essentially the same as the CUDA code.
2017-03-24 02:59:53 +01:00
e77e2727fc CLEW: Fix wrong clCreateSubBuffer function pointer 2017-03-24 02:59:53 +01:00
85db3f2866 Cycles Denoising: Implement OpenCL denoising kernels
Unfortunately, as always when making kernel code OpenCL compatible, that mainly means adding a huge amount of annoying pointer qualifiers...
2017-03-24 00:49:19 +01:00
523e98d958 Cycles Denoising: Remove useless include 2017-03-24 00:49:18 +01:00
ac98c87f89 Cycles Denoising: Print file and line when opencl_assert fails 2017-03-24 00:49:18 +01:00
d4bc2bf8b9 Cycles Denoising: Add separate set_tiles device callback
For CPU and CUDA, it was possible to determine the pointers to the tile buffers on the host and just fill the TilesInfo there.
However, for OpenCL the actual pointer is only known inside the kernel, so a separate kernel for setting them is needed.
2017-03-24 00:49:18 +01:00
1e964311e5 Cycles Denoising: Rework memory offset function for upcoming OpenCL integration 2017-03-24 00:49:18 +01:00
537d7c038a Cycles Denoising: Cleanup safe_isfinite 2017-03-24 00:49:18 +01:00
d9e9bee140 Cycles Denoising: Use kernel_config.h for CUDA as in the regular kernel 2017-03-24 00:19:33 +01:00
b8fe374b57 Cycles Denoising: Bring back UI option after recent merge 2017-03-24 00:13:44 +01:00
26fb2fabcd Cycles Denoising: Fix variance pass generation in the split kernel 2017-03-24 00:13:43 +01:00
f7bac653cf Cycles Denoising: Remove tile overscan when using GPU rendering
Denoising a pixel requires access to the other pixels surrounding it. On the CPU, this is solved by waiting for the neighboring tiles to be rendered before the central tile is denoised.
On the GPU, it was handled by rendering larger tiles internally and discarding the overscan area after denoising. That saved a bit of memory, but wasted computation (with 256x256 tiles and a half-window of 8, 13% of rendered pixels were never actually seen).

Also, supporting overscan tiles made the code more complex. So, this commit removes the overscan code and uses the CPU approach on GPUs as well.
2017-03-24 00:13:43 +01:00
3af69a0a49 Cycles Denoising: Remove unused functions left over from the denoise-after-rendering removal 2017-03-24 00:13:43 +01:00
a98081c929 Cycles Denoising: Remove obsolete Keep Denoising Data option 2017-03-24 00:13:43 +01:00
f70010fd40 Cycles Denoising: Implement support in the split kernel 2017-03-23 16:27:51 +01:00
40670b0b14 Merge updated shadowcatcher into temp-cycles-denoising 2017-03-23 15:59:08 +01:00
4948627b18 Update the shadowcatcher patch and make it look like a merge so that Git is happy 2017-03-23 15:13:17 +01:00
80617e8bd2 Cycles Denoising: Deduplicate split/mega kernel code in the CUDADevice 2017-03-15 20:20:29 +01:00
6eefa6dd3b Cycles Denoising: Deduplicate split/mega kernel code in the CPUDevice
This also means that denoising is now also executed when using the split kernel, but the result is black since the kernel doesn't write the data yet.
2017-03-15 20:20:00 +01:00
7631c83c5f Cycles Denoising: Fix NaN results in a few scenes 2017-03-13 22:59:13 +01:00
15a80ea05c Merge remote-tracking branch 'origin/master' into temp-cycles-denoising
Conflicts:
	intern/cycles/device/CMakeLists.txt
	intern/cycles/device/device_cpu.cpp
	intern/cycles/device/device_cuda.cpp
	intern/cycles/device/device_task.h
	intern/cycles/device/opencl/opencl_split.cpp
	intern/cycles/kernel/CMakeLists.txt
	intern/cycles/kernel/kernel_path_branched.h
	intern/cycles/kernel/kernel_path_surface.h
	intern/cycles/kernel/kernel_shader.h
	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
	intern/cycles/kernel/kernels/cuda/kernel.cu
	intern/cycles/kernel/split/kernel_buffer_update.h
	intern/cycles/kernel/svm/svm_closure.h
	intern/cycles/util/util_atomic.h
2017-03-12 04:18:45 +01:00
0251626dad Cycles Denoising: Store feature passes in PathRadiance 2017-02-13 01:39:27 +01:00
cb8935583e Cycles Denoising: Fix CUDA runtime kernel compilation 2017-02-13 00:26:02 +01:00
f5d27d6a7c Cycles Denoising: Use Shadowcatcher code for the denoising visibility feature 2017-02-13 00:25:40 +01:00
d6aaf8606d Merge branch 'cycles_shadow_catcher' into temp-cycles-denoising 2017-02-12 04:00:12 +01:00
3aba11c82e Cycles: Experiment with shadow catcher
**DISCLAIMER**: This is more a code dump of a local branch than anything really finished. The underlying math is subject to rework, since it's not quite physically based at all.

Publishing to start collaboration with other Cycles developers who are looking into solving this puzzle.

=== What do we consider a shadow catcher? ===

That's a good question, actually: there's no single formulation of what it exactly is, and mathematically it's a bit ill-defined under the constraints we're working with. Ideally, a shadow catcher is the difference between the image rendered without artificial objects and the image rendered with them. That approach gives the best possible shadows, but takes 2x more time to render. So for good usability we need to make some assumptions, making the system a bit more biased but giving artists a useful tool.

Shadow catchers are mainly used by VFX artists to inject artificial objects into real footage. At least, that is the definition we'll stick to
in Blender. Hence, here's what a shadow catcher should be capable of doing:

- Receive shadows from other objects: be totally transparent when there are no shadows cast on it, and more opaque in shaded areas.

- Ignore self-shadowing and shading. Shadows caused by self-occlusion already exist in the footage. The same applies to
shading -- all shading caused by the material itself is already in the footage.

- Interact with other objects in the scene. This sounds a bit tricky but actually makes sense. Consider the situation where one needs to put a sharp glossy object into the footage: you'll want objects from the real scene to be reflected in the artificial object, and often you'll want the object the shadow is cast onto to be reflected as well. Sure, you could get away with copying the object and playing with ray visibility, but that's a complicated scene setup instead of a simpler one.

- Be affected by indirect light. Cycles is a GI render engine, after all!

=== How to use the shadow catcher? ===

1. Create an object on which you want to receive shadows.
2. Create some basic material setup which is close to a real object.
3. Enable "Shadow Catcher" in Object buttons -> Cycles Settings.
4. Be happy! (hopefully, once we've debugged all the code)

=== What does this patch actually contain? ===

It contains all the bits which try to implement the definition of a shadow catcher above. It attempts to do so without big changes to the ray integration loop, hence it has some tricky magic to deduce the received shadow from the light passes, and it will fail in certain situations, mainly when there is no direct lighting of the object at all. It is totally tweakable to become more artist-friendly; I just didn't have enough time to try all the ideas and used whatever latest semi-working formula there was.

Major changes are in fact made around shadow_blocked() to exclude shading from self. This part is based on an older patch which tried to expose it to the user. Those exposed settings are somewhat malformed and shouldn't really be used; in fact, we should remove them from the interface.

=== Some pictures? ===

Sure, here's one from a hackish patch:

{F282085}

(This is a glossy monkey on a checkerboard floor; the floor is marked as a catcher. Here's the .blend file: {F282088})

Reviewers: lukasstockner97, juicyfruit, brecht

Subscribers: jensverwiebe, Nikos_prinio, brecht, lukasstockner97, borisdonalds, aliasguru, YAFU, forest-house, uli_k, aditiapratama, hype, davidandrade, printerkiller, jta, Davd, johnroper100, poor, lowercase, juang3d, GiantCowFIlms, iklsr, gandalf3, sasa42, saphires, duarteframos, madog, Lapineige, railla, zuggamasta, plasmasolutions, jesterking

Differential Revision: https://developer.blender.org/D1788
2017-02-12 03:18:00 +01:00
d78b6bcc44 Cycles Denoising: Remove old shadow estimation kernel code 2017-02-12 03:16:01 +01:00
728bd309c6 Merge remote-tracking branch 'origin/master' into temp-cycles-denoising 2017-02-12 03:04:13 +01:00
6e7e64e58b Cycles Denoising: Remove unused functions in the buffer code 2017-02-12 03:01:43 +01:00
2687b63bf7 Cycles Denoising: Remove Render Pass changes 2017-02-12 01:19:33 +01:00
81c0dbdcc8 Cycles Denoising: Remove post-processing denoising code from the Render API
This reverts commits e001dbbcc4, 94fa941f7f, e7bf6c67b4, e549000b39, 58619096a5, 3bcafb6dc4, 0d78ac4199, c167e5d723.
2017-02-11 23:27:37 +01:00
1730a7261e Cycles Denoising: Rename variables for consistency and move them out of KernelData 2017-02-11 04:38:57 +01:00
ac3c9e6d17 Cycles Denoising: Remove denoising-after-rendering functions 2017-02-11 04:38:57 +01:00
b2900ad098 Cycles Denoising: Remove standalone denoising Python API function 2017-02-11 04:38:57 +01:00
6fee3a5db7 Cycles Denoising: Remove support for cross-frame denoising 2017-02-10 17:47:39 +01:00
e5bcd95c29 Cycles Denoising: Revert all changes to Cycles Standalone 2017-02-10 17:45:56 +01:00
641cd4840b Cycles Denoising: Enable floating point exceptions in filter code if enabled in CMake 2017-02-10 17:45:37 +01:00
79e1f52c68 Cycles Denoising: Add missing util header file to CMake 2017-02-10 05:56:49 +01:00
918a1d0b73 Cycles Denoising: Fix wrong argument order that caused massive overblurring 2017-02-10 05:56:49 +01:00
2ff3767574 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	source/blender/blenloader/intern/writefile.c
2017-02-09 14:27:02 +01:00
04abe01b6c Cycles Denoising: Use device-independent denoising code for CUDA as well
As a result, cross-denoising on CUDA works now.
2017-02-08 22:53:06 +01:00
2f6db0e227 Cycles Denoising: Use device-independent denoising in the CPUDevice 2017-02-08 17:32:21 +01:00
fb0596f75b Cycles Denoising: Implement device-independent denoising algorithm to deduplicate device code
Instead of implementing the full algorithm inside the device, the new code just calls the device in order to run specific kernels while handling all the high-level logic outside of the individual devices.
2017-02-08 17:32:17 +01:00
54faf64692 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2017-02-04 04:08:34 +01:00
4bc56ca9b1 Cycles Denoising: Move denoising kernels to a separate compilation unit
Previously, the denoising kernels were just included with the other kernels.
However, that is not ideal, since the kernels already take very long to compile. Also, it isn't needed since the rendering and denoising kernels share basically no code.

So, this commit adds intern/cycles/filter/, which contains the filtering kernels.
2017-02-04 04:08:11 +01:00
8308c5632e Cycles Denoising: Support GPU denoising from the command line 2017-02-02 00:45:16 +01:00
5f7e57a7f9 Cycles Denoising: Fix command-line denoising and add filter weighting adjust option 2017-02-02 00:43:59 +01:00
ccde2c7c31 Cycles Denoising: Fix building without filter debugging 2017-02-01 18:10:03 +01:00
71e1c3bc4b Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2017-02-01 18:07:35 +01:00
671a28ae55 Cycles Denoising: Change order of passes in the denoising buffer 2017-02-01 05:01:22 +01:00
7a26ded06a Cycles Denoising: Remove useless functions 2017-02-01 05:00:17 +01:00
926519cdee Cycles Denoising: Fix sample scaling for cross-denoising passes 2017-02-01 05:00:17 +01:00
4878b2d4ba Cycles Denoising: Allow to ignore cross-denoising passes 2017-02-01 05:00:16 +01:00
35d3adc350 Cycles Denoising: Fix multilayer handling when denoising after rendering 2017-02-01 05:00:16 +01:00
7f5e06bee7 Cycles Denoising: Fix Whitespace errors 2017-02-01 05:00:16 +01:00
6a1c157310 Cycles Denoising: Implement new cross-denoising mode on CPU
Not very useful yet.
2017-02-01 05:00:16 +01:00
e001dbbcc4 Cycles Denoising: Support postprocess denoising on loaded images 2017-02-01 05:00:16 +01:00
9b5beba9fd Cycles Denoising: Also enable FPE debugging during denoising 2017-02-01 05:00:16 +01:00
cf64c891f5 Cycles Denoising: Fix Cross Denoising after Rendering 2017-02-01 05:00:16 +01:00
a54b38c4f8 Cycles Denoising: Correct buffer variance calculation 2017-02-01 05:00:15 +01:00
752676fac7 Cycles Denoising: Allow to reconstruct into denoising buffer 2017-02-01 05:00:15 +01:00
ffd605c2fa Cycles Denoising: Change memory layout of noisy color and variance information 2017-02-01 05:00:15 +01:00
07462ddf6d Cycles Denoising: Remove useless functions 2017-02-01 05:00:15 +01:00
2e62da3b87 Cycles Denoising: Add missing kernel function stubs 2017-02-01 05:00:15 +01:00
a0777d82e5 Cycles Denoising: Use squared samples for denoising-after-rendering when they're enabled
Previously, the setting would be ignored, which led to extremely overblurred results.
2017-02-01 05:00:15 +01:00
5c9eaeb676 Cycles Denoising: Use different heuristic for feature space dimensionality reduction
Since the features that are used for denoising may be highly correlated (for example, with a greyscale texture the three albedo channels will be identical), using them directly for fitting would be rather unstable.
Therefore, before performing the actual fit, a transformation into a reduced feature space is performed using Principal Component Analysis, by calculating the eigendecomposition of X^t*X, where X is the feature matrix.
After doing that, the eigenvectors are the basis vectors of the new feature space, and the eigenvalues specify their "importance". Therefore, by discarding eigenvectors whose eigenvalues are low, it's possible to get rid of unnecessary dimensions.

Now, the question is which dimensions should be removed. The original WLR algorithm calculates a threshold based on the variance of the feature passes, with the goal of discarding noisy features. However, this implementation already prefilters the feature passes, so the (original) variance passes overestimate the actual variance a lot and discarding them isn't actually needed anymore.
Therefore, this commit replaces it with two simpler heuristics - either removing all eigenvalues below a certain threshold, or removing until a certain fraction of the energy in the eigenvalues is gone.
Which heuristic is used is chosen based on the sign of the filter strength, positive values choose the energy heuristic and negative values the absolute heuristic. In both cases, the threshold value is 10^(2*abs(filter strength)). If the default of zero is used, it uses the energy heuristic with a fraction of 10^-3.

Note that in some cases, especially motion blur and depth of field, this might cause new artifacts. These can be solved and I'll commit that soon. On the positive side, this change makes the denoiser handle hair/fur much better.
2017-02-01 05:00:14 +01:00
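The reduction described above can be sketched in NumPy: eigendecompose X^T X, then drop the smallest eigenvalues until just under a given fraction of the total energy is gone. This is a minimal illustration under assumed names and cutoff logic, not Cycles' implementation:

```python
import numpy as np

def reduced_feature_space(X, energy_fraction=1e-3):
    # Eigendecomposition of X^T X; np.linalg.eigh returns ascending eigenvalues.
    w, V = np.linalg.eigh(X.T @ X)
    w = np.maximum(w, 0.0)  # guard against tiny negative values from round-off
    # Energy heuristic: discard the smallest eigenvalues whose combined
    # energy stays below energy_fraction of the total.
    discarded = np.cumsum(w)  # ascending order, so prefix sums = discarded energy
    keep = discarded > energy_fraction * w.sum()
    return V[:, keep]  # basis vectors of the reduced feature space
```

With a feature matrix whose third column is nearly constant zero, the near-zero eigenvalue is dropped and a 2-dimensional basis remains.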
950b42b797 Cycles Denoising: Also write NLM-filtered version of the color input when debugging is enabled 2017-02-01 05:00:14 +01:00
d089e9bfaa Cycles Denoising: Remove useless if() in the kernel code 2017-02-01 05:00:14 +01:00
dd4d392b69 Cycles Denoising: Fix compilation of GTests 2017-02-01 05:00:14 +01:00
a0d6ca39dd Cycles Denoising: Redesign debug EXR writing code for a cleaner integration
Also, remove the denoising passes from CUDA - they just add too much clutter, and the results are the same as on the CPU anyway.
2017-02-01 05:00:10 +01:00
a14096107d Cycles Denoising: Cleanup by renaming a variable 2017-01-31 02:11:11 +01:00
1049ea8476 Cycles/Render API: Get rid of Render Pass flags and port Denoising passes to the new system
This also means that the temporary uint64_t passflag hack is removed now.
2017-01-20 05:46:32 +01:00
ca6d583008 Cycles: Implement new NLM reconstruction kernels 2017-01-13 16:45:13 +01:00
466bd61857 Cycles: Implement new NLM kernels for CUDA 2017-01-13 00:09:09 +01:00
59bc9a1d51 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2017-01-12 04:56:28 +01:00
73c48d7347 Cycles Denoising: Use better and faster NLM implementation for feature pass prefiltering
Previously, the prefiltering NLM kernel was implemented just as it's described in the paper:
For every pixel P, loop over every pixel Q in the search window. Then, loop over the small patches around them, calculate the average difference, and use that to compute the weight of Q for the denoised result at P.

However, that gives you a time complexity of O(N^2 * R^2 * F^2), where N is the image size, R the search window and F the patch size...
So, this patch implements the clever idea from "A Simple Trick to Speed Up and Improve the Non-Local Means" - by reformulating the loop, it's actually possible to skip a lot of computation and replace it with a separable box filter convolution. This reduces the complexity to O(N^2 * R^2 * F), and the number of pixel differences calculated even to O(N^2 * R^2)!

Furthermore, by applying a second box-filter pass after calculating the weights, we get the "patchwise NLM" improvement basically for free!

This is CPU-only so far, but that will change soon.
2017-01-12 04:56:07 +01:00
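The reformulation works like this: for a fixed neighbor offset t, the patch distance between every pixel p and its neighbor p+t is just a box filter of the per-pixel squared differences between the image and its shifted copy. A hedged NumPy illustration (function names, parameters, and the exponential weighting are assumptions for the sketch, not the Cycles kernels):

```python
import numpy as np

def box_filter(img, f):
    # Separable (2f+1)x(2f+1) box filter done as two 1-D passes.
    k = np.ones(2 * f + 1) / (2 * f + 1)
    img = np.apply_along_axis(lambda r: np.convolve(r, k, mode='same'), 1, img)
    img = np.apply_along_axis(lambda c: np.convolve(c, k, mode='same'), 0, img)
    return img

def nlm_weights_for_offset(img, t, f, h):
    # Weight of neighbor p+t for every pixel p at once: shift the image,
    # square the differences, and box-filter to get patch-averaged distances.
    # This replaces the per-pixel patch loop of the naive formulation.
    shifted = np.roll(img, t, axis=(0, 1))
    dist = box_filter((img - shifted) ** 2, f)
    return np.exp(-np.maximum(dist, 0.0) / (h * h))
```

A second box-filter pass over the resulting weights would give the "patchwise NLM" variant mentioned above.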
11f8238315 Cycles: Clean up passing of int4 to CPU kernels
Since the type of int4 depends on whether SSE is enabled, the SSE kernels expect a different type than the device code.
Therefore, the content must be passed as a pointer...
2017-01-12 04:56:07 +01:00
5cec64b4dc Cycles Denoising: Remove now-useless feature utilities 2017-01-12 04:56:07 +01:00
af5a75bae0 Cycles Denoising: Remove WLR and experimental NFOR modes
The denoising code started out as an implementation of WLR, but the NLM mode is working so much better that I decided to remove the WLR mode completely.
This allows us to get rid of a significant amount of complexity and code.

Also, the NFOR mode is removed - the name is misleading, most of the ideas behind the NFOR paper are actually what powers the NLM mode. NFOR mode was just an experiment with removing the T-SVD feature space reduction, and it turned out that the experiment had failed.
2017-01-11 07:37:36 +01:00
a9da879efa Cycles Denoising: Remove hidden NLM-only option 2017-01-11 06:28:08 +01:00
abac5736e4 Cycles: Rename and add new matrix math helper functions to clean up the actual filter code 2016-12-21 05:09:28 +01:00
5857c9c172 Cycles: Deduplicate numerical code in the WLR bandwidth estimation
Also includes some documentation about what this code actually does.
2016-12-21 02:29:56 +01:00
93c275347e Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	source/blender/makesrna/intern/rna_render.c
2016-12-20 15:01:04 +01:00
a7220bffbf Cycles: Move normal equation solver into a separate function and rename a few variables for clarity 2016-12-20 05:38:05 +01:00
8a65d6cf11 Cycles: Replace T-SVD algorithm with new Jacobi Eigendecomposition solver
The code that was used for the T-SVD before came from the WLR reference implementation,
but had numerical problems on Windows and would often cause NaNs.

This commit replaces it with a new implementation using Eigendecomposition based on the Jacobi Eigenvalue Method.
That should:
- Give a slight performance boost (probably not noticeable, since the T-SVD was no bottleneck to begin with)
- Improve numerical accuracy of the results (not very important either since the eigenvalues are only compared against a threshold)
- FINALLY solve the black spot issue on Windows
- Slightly reduce memory usage (singular values are now constructed on the diagonal of the input matrix) with the potential of more in the future (now only the lower-triangular part is required).
- Resolve potential licensing issues - the specific file containing the original code didn't come with any licensing information, and the main file contains an apparently custom license...
2016-12-20 05:38:04 +01:00
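The Jacobi eigenvalue method mentioned above can be sketched in a few lines: repeated Givens rotations drive the off-diagonal entries of the symmetric matrix to zero, so the eigenvalues accumulate on the diagonal and the product of rotations gives the eigenvectors. Illustrative Python, not the actual kernel code:

```python
import numpy as np

def jacobi_eigh(A, sweeps=10, eps=1e-12):
    # Cyclic Jacobi eigendecomposition of a symmetric matrix A.
    A = A.astype(float).copy()
    n = A.shape[0]
    V = np.eye(n)
    for _ in range(sweeps):
        for p in range(n - 1):
            for q in range(p + 1, n):
                if abs(A[p, q]) < eps:
                    continue
                # Rotation angle that zeroes A[p, q] for this pair.
                theta = 0.5 * np.arctan2(2 * A[p, q], A[q, q] - A[p, p])
                c, s = np.cos(theta), np.sin(theta)
                J = np.eye(n)
                J[p, p] = J[q, q] = c
                J[p, q], J[q, p] = s, -s
                A = J.T @ A @ J   # similarity transform keeps eigenvalues
                V = V @ J         # accumulate eigenvectors
    return np.diag(A), V
```

For a 2x2 symmetric matrix a single rotation already diagonalizes exactly; larger matrices converge over a few sweeps.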
9acc7952e1 Cycles: Declare more utility function arguments const 2016-12-20 05:38:04 +01:00
fe76a878db Cycles: Include util_math_matrix.h in CMake 2016-12-20 05:38:04 +01:00
04c0e7b2d7 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	intern/cycles/device/device_cpu.cpp
	intern/cycles/device/device_cuda.cpp
	intern/cycles/render/tile.cpp
	intern/cycles/render/tile.h
2016-12-06 21:13:06 +01:00
0be2b57738 Cycles: Clamp optimal bandwidth parameter to the range of evaluated parameters 2016-12-02 23:31:45 +01:00
8c8b81dfdb Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-12-02 22:14:52 +01:00
be47582c46 Cycles: Actually enable FTZ
The previous commit added the function to enable it, but didn't ever call it...
2016-12-02 22:14:01 +01:00
9a66b66c57 Cycles: Fix Progressive Rendering
This also fixes the material and world preview bug.
2016-12-01 22:43:55 +01:00
9b9edf7c73 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-11-26 04:19:34 +01:00
9c779e605c Cycles: Properly initialize some denoising-related variables 2016-11-26 04:18:48 +01:00
91e1376bf9 Cycles: Fix progress calculation while denoising after rendering 2016-11-26 03:29:48 +01:00
94fa941f7f Cycles: Fix progress bar not appearing for denoising after rendering 2016-11-25 00:48:55 +01:00
e7bf6c67b4 Cycles: Fix post-processing not working again after it was cancelled before 2016-11-25 00:48:10 +01:00
e549000b39 Render API: Use RE_InitState for postprocessing operator 2016-11-24 13:47:25 +01:00
9a440cd9df Cycles: Improve tile highlighting when denoising after rendering 2016-11-24 13:45:53 +01:00
2549e26f85 Cycles: Sync new filter settings when using denoising after rendering 2016-11-24 13:42:25 +01:00
ad9a2e9ab1 Cycles: Include the new headers for CUDA in CMake 2016-11-24 03:18:31 +01:00
8e1e90d879 Cycles: Fix selective denoising 2016-11-24 03:18:31 +01:00
51e9b190c8 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	intern/cycles/blender/blender_session.cpp
2016-11-22 04:12:44 +01:00
358191c9d7 Cycles: Enable Flush-to-zero for denormals on Windows 2016-11-22 04:11:48 +01:00
0dc23065e2 Cycles: Fix Denoising Pass detection when loading Multilayer EXRs 2016-11-22 04:11:48 +01:00
7040a071cf Cycles Denoising: Add standalone animation denoising call to the Cycles Python API
Previously, a build of Cycles Standalone was needed for animation denoising.
Now, it is possible to call it from Python Scripts in Blender or directly from the command line (through --python-expr).
2016-11-22 04:11:48 +01:00
9bb8e42e6e Cycles Denoiser: Store debug info in one exr per tile instead of many pfms 2016-11-22 04:11:48 +01:00
3544d6d6ca Cycles Denoising: Fix pass handling with MSVC 2016-11-22 04:11:48 +01:00
2b7e4beaa8 Cycles: Move code to new and nicer file layout 2016-11-22 04:11:48 +01:00
aabc71c6bb Cycles: Remove old WLR filter 2016-11-22 04:11:48 +01:00
584bbd7e1e Cycles: Get rid of one loop over all denoised pixels by using backsubstitution instead of explicit inversion 2016-11-22 04:11:47 +01:00
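Substituting through the triangular factors avoids forming an explicit inverse and the extra pass that multiplies by it. A generic sketch of solving the normal equations via a Cholesky factorization (np.linalg.solve is used on the triangular factors for brevity; this is not the actual kernel code):

```python
import numpy as np

def solve_normal_equations(XtWX, XtWy):
    # Solve (X^T W X) beta = X^T W y without explicit inversion.
    L = np.linalg.cholesky(XtWX)   # XtWX = L @ L.T, L lower-triangular
    z = np.linalg.solve(L, XtWy)   # forward substitution: L z = XtWy
    return np.linalg.solve(L.T, z) # back substitution: L.T beta = z
```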
b4e774722f Cycles: Move the final_pass functions to a unified implementation and generally shuffle a lot of code around 2016-11-22 04:11:16 +01:00
8717a677fe Cycles: Improve robustness of the Cholesky decomposition 2016-11-22 04:10:56 +01:00
f928cd357f Cycles: Implement Cross-Denoising
Right now this is pretty bad, but it can be made much better.
2016-11-22 02:44:34 +01:00
36682585db Cycles: Tweak feature pass heuristic 2016-11-19 19:19:07 +01:00
2f2a1bbee5 Cycles: Fix optimization includes 2016-11-19 02:18:45 +01:00
a9ef88b1d8 Cycles: Rename collaborative denoising option 2016-11-17 14:02:34 +01:00
4fc8be6644 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-11-17 13:36:34 +01:00
0fe30e156f Cycles: Cache NLM weights 2016-11-17 13:23:30 +01:00
9d8c7bc9c2 Cycles: Rename Denoising DNA flags for clarity 2016-11-16 17:27:34 +01:00
5c4b97a130 Cycles: Add options for NLM and collaborative filtering 2016-11-16 17:27:34 +01:00
8478bce73b Cycles: Fix Collaborative CUDA WLR denoising 2016-11-16 17:27:34 +01:00
58c353d4de Cycles: Fix CUDA compilation with NLM 2016-11-16 15:59:58 +01:00
20f67b18c8 Cycles: Move collaborative division into the kernel 2016-11-16 15:59:58 +01:00
aa605da25a Cycles: Experimental collaborative denoising
Right now tile borders are very visible, but that'll be fixed!
2016-11-16 15:59:57 +01:00
0ffcfcd8a8 Cycles: Implement SSE4.1-vectorized NLM weights 2016-11-16 15:59:35 +01:00
c3063cb13f Cycles: Fix kernel C++ files 2016-11-16 15:59:35 +01:00
243d86702e Cycles: Fix a compiler warning 2016-11-16 15:59:35 +01:00
79ce8c2d68 Cycles: Support denoising weighting adjustment 2016-11-16 15:59:34 +01:00
b920d57e51 Cycles: Allow to use the NLM-weighted first-order regression filtering 2016-11-16 15:59:31 +01:00
ff77a1de08 Cycles: Store CPU kernel function pointers in objects to clean up the code
With the number of kernel functions that are currently needed, it was just getting too messy.
This commit is based on D1825.
2016-11-14 12:25:04 +01:00
cf60e3f5db Cycles: Implement NLM-weight filtering kernel 2016-11-14 12:25:03 +01:00
397c2674a7 Cycles: Split up the estimate_params kernel 2016-11-14 12:25:03 +01:00
9aabef3975 Cycles: Add a NLM filter for testing purposes 2016-11-14 11:08:03 +01:00
a72222c6be Cycles: Split up the denoising feature row functions 2016-11-14 11:08:03 +01:00
e096634f54 Cycles: Fix an unused variable warning 2016-11-14 11:07:56 +01:00
d21990c3bf Cycles: Fix two debugging problems in the Denoiser
The mismatching alloc/dealloc would upset ASan, and the undefined elements would cause FPEs in the NLM code (the result was fine due to masking, but the intermediate values weren't).
2016-11-13 17:40:28 +01:00
322c51e817 Cycles: Fix CPU Denoising kernels after recent SSE-for-vector-types commit
That change (enabling __KERNEL_SSE__ for AVX(2)) broke the ABI between the kernel and device code when int4 was passed as an argument.
Now, a pointer to the first element is passed instead.
2016-11-13 17:39:00 +01:00
a5e4bfb807 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	intern/cycles/blender/blender_python.cpp
	intern/cycles/device/device_opencl.cpp
	intern/cycles/kernel/kernel_accumulate.h
	intern/cycles/kernel/kernel_path.h
	intern/cycles/kernel/kernel_path_branched.h
	intern/cycles/kernel/kernel_path_surface.h
	intern/cycles/kernel/kernel_types.h
	intern/cycles/kernel/svm/svm_displace.h
2016-11-11 03:17:19 +01:00
91073782f6 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-09-09 17:02:37 +02:00
3bba3d3241 Cycles: Support filter strength for denoising after rendering 2016-09-09 17:01:49 +02:00
42bd063063 Cycles: Temporarily add old kernel for comparison 2016-09-05 04:03:52 +02:00
647f561a62 Merge branch 'master' into soc-2016-cycles_denoising 2016-09-05 04:03:42 +02:00
057821cd57 Cycles: Tweak the reduced feature space calculation 2016-09-03 17:09:00 +02:00
da59443ed9 Fix two compilation issues 2016-08-23 23:01:11 +02:00
7dd23af0f0 Cycles: Get rid of tile border artifacts when denoising after rendering or standalone denoising
The issue was that although all of the image is available, the prefiltering system didn't use the area outside of the
current tile, which caused visible seams.
2016-08-23 20:56:35 +02:00
d9251d92d8 Cycles: Remove another two useless files 2016-08-23 17:47:44 +02:00
87808023bb Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-08-23 17:37:27 +02:00
ac75f262b2 Cycles: Fix a memory leak in the CUDA denoising code 2016-08-23 17:34:36 +02:00
29105adbab Cycles: Use separate struct for CUDA denoising storage to avoid allocating the transform memory twice 2016-08-23 17:34:04 +02:00
29c15f9d68 Cycles: Support cross-frame denoising on CUDA 2016-08-23 17:33:32 +02:00
930ab20554 Cycles: Fix Shadow prefiltering for cross-frame filtering 2016-08-23 17:32:29 +02:00
5c9a6bbcce Cycles: Fix building on Windows 2016-08-23 17:31:56 +02:00
8ad0423c6f Cycles: Revert design_row redesign
This commit reverts fba2b77c2a since it turned out that it actually doesn't help with speed at all - I screwed up the original benchmarking...
Considering that there is no real performance difference, the increased complexity isn't worth it.
2016-08-21 17:38:50 +02:00
b73aed1319 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-08-21 06:07:37 +02:00
e1df90ad39 Cycles: Use the correct bias and variance models for the least-squares fit and global bandwidth optimization
The approach that is used to find the global bandwidth is:
- Run the reconstruction filter for different bandwidths and estimate bias and variance
- Fit analytic bias and variance models to these bandwidth-bias/variance pairs using least-squares
- Minimize the MSE term (Bias^2 + Variance) analytically using the fitted models

The models used in the LWR paper are:
- Bias(h) = a + b*h^2
- Variance(h) = (c + d*h^(-k))/n
, where (a, b, c, d) are the parameters to be fitted, h is the global bandwidth, k is the rank and n is the number of samples.

Classic linear least squares is used to find a, b, c and d.
Then, the paper states that MSE(h) = (Bias(h)^2 + Variance(h)) is minimal for h = (k*d / (4*b^2*n))^(1/(k+4)).
Now, what is suspicious about this term is that a and c don't appear.
c makes sense - after all, its contribution to the variance is independent of h.
a, however, does not - after all, the Bias term is squared, so a term that depends on both h and a exists.

It turns out that this minimization term is wrong for these models, but instead correct when using Bias(h) = b*h^2 (without constant offset).
That model also makes intuitive sense, since the bias goes to zero as filter strength (bandwidth) does so.
Similarly, the variance model should go to zero as h goes towards infinity, since infinite filter strength would eliminate all possible noise.

Therefore, this commit changes the bias and variance models to not include the constant term any more.
The change in result can be significant - in my test scene, the average bandwidth halved.
2016-08-21 06:06:24 +02:00
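Written out, the corrected models from the commit above and the optimum they imply are (a restatement of the commit's derivation, using its own symbols):

```latex
\mathrm{Bias}(h) = b\,h^2, \qquad
\mathrm{Var}(h)  = \frac{c + d\,h^{-k}}{n}

\mathrm{MSE}(h) = \mathrm{Bias}(h)^2 + \mathrm{Var}(h)
               = b^2 h^4 + \frac{c + d\,h^{-k}}{n}

\frac{d\,\mathrm{MSE}}{dh} = 4\,b^2 h^3 - \frac{k\,d}{n}\,h^{-k-1} = 0
\;\Longrightarrow\;
h = \left(\frac{k\,d}{4\,b^2\,n}\right)^{1/(k+4)}
```

Note that c survives in the MSE but vanishes on differentiation, so it legitimately never appears in the paper's formula for h, while a only disappears once the constant bias term is dropped.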
fba2b77c2a Cycles: Further improve CUDA denoising speed by redesigning the design_row
The previous algorithm was:
- Fetch buffer data into the feature vector which was in shared (faster) memory
- Use the feature vector to calculate the weight and the design_row, which was stored in local (slower) memory
- Update the Gramian matrix using the design_row

Now, the problem there is that the most expensive part in terms of memory accesses is the third step, which means that having the design_row in shared memory would be a great improvement.

However, shared memory is extremely limited - for good performance, the number of elements per thread should be odd (to avoid bank conflicts), but even going from the 11 floats that the feature vector needs to 13 already significantly hurts the occupancy.
Therefore, in order to make room for the design_row, it would be great to get rid of the feature vector.

That's the first part of the commit: By changing the order in which the design_row is built, the first two steps can be merged so that the design_row is constructed directly from the buffer data instead of going through the feature vector.
This has a disadvantage - the old design_row construction had an early-abort for zero weights, which was pretty common. With the new structure, that's not possible anymore. However, this is less of a problem on GPUs due to divergence - for the abort to gain any time, all 32 threads in the warp had to abort anyway.

Now the feature vector doesn't take up memory anymore, but the design_row is still too big - it has up to 23 elements, which is far too much.
It has a useful property, though - the first element is always one, and the last 11 elements are just the squares of the first 11. So, storing 11 floats is enough to have all information, and the squaring can be performed when the design_row is used.
Therefore, the second part of the commit adds specialized functions that accept this reduced design_row and account for these missing elements.
2016-08-21 06:06:24 +02:00
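The reduced-row trick described above can be sketched like this (function and variable names are hypothetical illustrations, not the actual kernel code): only the 11 raw feature values are stored per thread, and the leading one and the trailing squares are reconstructed when the row is consumed.

```python
def expand_design_row(reduced):
    # Full row is [1, f_1 .. f_11, f_1^2 .. f_11^2]; the leading 1 and
    # the squares carry no extra information, so only the 11 floats in
    # `reduced` need to live in (scarce) shared memory.
    return [1.0] + list(reduced) + [f * f for f in reduced]

def accumulate_gramian(gramian, reduced, weight):
    # XtWX update that consumes the reduced storage directly,
    # expanding the missing elements on the fly.
    row = expand_design_row(reduced)
    for i, ri in enumerate(row):
        for j, rj in enumerate(row):
            gramian[i][j] += weight * ri * rj
```

In the real kernel the specialized functions avoid materializing the full row at all; the sketch expands it only to keep the Gramian update obvious.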
8dcf23bb31 Cycles: Fix various issues with the denoising debug passes 2016-08-21 06:06:22 +02:00
5008072fcb Cycles: Fix undefined filter strength when using standalone denoising 2016-08-21 06:05:35 +02:00
da04caa812 Cycles: Fix wrong sample variance variance calculation
The missed factor caused the NLM filtering of the buffer variance to essentially reduce to a simple box filter,
which overblurred the buffer variance and therefore caused problems with sharp edges in the shadow buffer.
2016-08-21 06:05:35 +02:00
ece8cb8c4d Cycles: Fix wrong offset for feature matrix norm calculation 2016-08-21 06:05:35 +02:00
e1a2787464 Cycles: Revert to 6 bias-variance samples
The CUDA redesign commit removed the sample at h=2, but I found that this actually makes results worse.
Therefore, it's now added back.
2016-08-21 06:04:42 +02:00
559404d354 Cycles: Write optional debug info for CUDA shadow prefiltering 2016-08-21 06:04:42 +02:00
6a67f80a9d Cycles: Don't denoise the current tile if the user cancelled the render 2016-08-21 06:04:42 +02:00
51d816e74d Cycles: Add debugging option to CUDA for switching between large L1 cache or large shared memory 2016-08-21 06:04:42 +02:00
8771846b2b Cycles Denoising: Tweak shadow filtering 2016-08-21 06:04:42 +02:00
1db96fa89c Cycles: Redesign CUDA kernels to increase denoising performance
This commit contains essentially a complete overhaul of the CUDA denoising kernels.

One of the main changes is splitting up the huge estimate_params kernel into multiple smaller ones:
- One Kernel calculates the reduced feature space transform.
- One Kernel estimates the feature bandwidths.
- One Kernel estimates bias and variance for a given global bandwidth. This kernel is executed multiple times for different global bandwidths.
- One Kernel calculates the optimal global bandwidth.

This improves UI responsiveness since the individual kernel launches are shorter.
Also, smaller kernels are always a good thing on GPUs - from register allocation to warp divergence.

The next major improvement concerns the transform - before this commit, transform loads from global memory were the main bottleneck.
First of all, it's now stored in a SoA layout instead of AoS, which makes all transform loads coalesced.
Furthermore, the transform pointer is declared as "const float* __restrict__" instead of "float*", which allows NVCC to cache the transform reads. Since only the first kernel writes the transforms, this increases speed again.

The third major change is that the feature vector, which is used in every per-pixel loop, is now stored in shared memory.
Since the feature vector is involved in a lot of operations, this improves performance again.
On the other hand, shared memory is rather limited on Kepler and older, so even the 11 floats per thread are already a lot.
With the default "16KB shared - 48KB L1 Cache" split on a GTX780, occupancy is only 12.5% - way too low.
With "48KB shared - 16KB L1 Cache", occupancy is back up at 50%, but of course there are more cache misses - in the end, though, the benefits of having the feature vector local make up for that.

I expect the performance boost to be even higher on Maxwell and Pascal, since these have much larger shared memory and L1.
2016-08-21 05:51:50 +02:00
25df3ca147 Cycles: Fix wrong stride in buffer accesses when the tile size plus overscan wasn't a multiple of 4 2016-08-21 05:51:18 +02:00
4ab88b4a30 Cycles: Fix compilation with enabled filter debug output 2016-08-21 05:50:55 +02:00
c5e9fabe19 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
This was an extremely hacky merge with a lot of rebasing and git tricks involved, I hope it works as it's supposed to.
2016-08-21 05:38:48 +02:00
98dfe6f800 Cycles: Fix memory leak in the denoiser 2016-08-13 05:11:53 +02:00
2af9026894 Cycles: Implement the multi-frame denoising kernel
This commit changes the denoising kernel to actually use the additional frames.
The required changes are surprisingly small - one additional feature contains
the frame to which the pixel belongs, and the per-pixel loop now iterates over frames first.
2016-08-13 04:58:41 +02:00
e020820046 Cycles: Implement multi-frame denoising buffers
This commit changes the prefiltering code so that it processes all included frames.
2016-08-13 04:06:26 +02:00
1c675f1cc8 Cycles: Implement multi-frame buffer support and loading in standalone mode
This commits adds an option to the BufferParams that specifies how many frames are stored in there.
The frames share all other parameters, such as size and passes.
Frames are not stored in order - instead, the first frame is the primary frame, so that all code that uses
the RenderBuffers still works as expected, but code parts that can use the additional frames may do so.

The Standalone Denoising mode now comes with an option to specify the frame range that will be used for denoising.
When doing so, the input filename isn't an actual file, but has to contain a part of the form "%Xd" that specifies how the frame file names are formatted, where X is the length to which frames are zero-padded. That part will be replaced by the padded frame number before loading.

So far, no code actually uses the additional frames yet, that will come in the next commits.
2016-08-13 03:59:15 +02:00
bfffcb518a Cycles: Implement half float file output and fix flipped standalone-denoised images
Since the tonemapping task already supports both Byte and Half output,
the only needed change is to the DisplayBuffer itself.
2016-08-13 03:52:58 +02:00
343fd7087a Cycles Standalone: Implement the half window option 2016-08-13 03:50:27 +02:00
95fa483669 Cycles: Implement SSE3-optimized NLM prefiltering kernel 2016-08-09 03:50:15 +02:00
dba99c499f Cycles: Implement SSE3-optimized denoising kernel 2016-08-09 03:50:15 +02:00
741a2453bf Cycles: Add a few SSE utilities 2016-08-09 03:50:15 +02:00
6792499894 Cycles: Move denoising utility functions into a separate file 2016-08-09 03:50:15 +02:00
b7dc25cb5a Cycles: Move prefiltering functions into a separate file 2016-08-09 03:50:15 +02:00
cf017e8181 Cycles: Fix preprocessor directives around SSE3 replacement functions
The code is supposed to implement replacements for a few SSE4.1-specific functions so that they can be used with SSE3 as well.
Therefore, it was enabled when __KERNEL_SSE3__ was set, but __KERNEL_SSE4__ wasn't.

However, __KERNEL_SSE4__ is never set anywhere - the correct one is __KERNEL_SSE41__.
Because of that, the replacements were enabled for SSE4.1 and better (AVX) as well, where they're not needed, but only slow things down.
2016-08-09 03:50:15 +02:00
d23f00039f Cycles: Fix denoising feature matrix norm calculation 2016-08-09 03:50:14 +02:00
1926b15e3b Cycles: Remove a few useless temporary files 2016-08-09 03:50:14 +02:00
10cb9a193b Cycles: Fix GGX shader after recent changes to numerical stability 2016-08-09 03:50:14 +02:00
edcf60b4ef Cycles: Fix building after the debug_fpe commit 2016-08-09 03:50:14 +02:00
3f94371a36 Cycles: Add denoising option to the Cycles Standalone executable
To use it, call it with "./cycles --denoise --samples <sample number> --output <denoised_file.png> <rendered_image.exr>".
You need to enter the sample number that the image was rendered with - others will work as well, but might produce artifacts.
The input image can be generated by rendering with "Keep denoising data" enabled (denoising itself isn't needed) and saving the result as Multilayer EXR.

For now, this is mainly useful for quicker testing without re-rendering and profiling, not so much for regular users.
However, the next step will be to implement inter-frame denoising for animations, which will provide a significant quality boost.
2016-08-08 21:10:01 +02:00
ffea3f5aa2 Cycles Standalone: Add option to set the tile size 2016-08-08 21:09:12 +02:00
38f8ff4304 Cycles: Adapt CUDA device to the new denoise buffer architecture 2016-08-06 05:22:51 +02:00
077116f657 Cycles: Redesign the buffer accesses of the denoising kernel.
At first, the denoising kernel just directly accessed the RenderBuffers.
However, that introduced some addressing complexity since the filter window might cover multiple tiles, each with a separate buffer.
Apart from the addressing overhead, this also made it pretty much impossible to SIMDify the CPU code.

When feature prefiltering was added, it changed the buffer addressing.
First, it copied the various parts of different buffers into one continuous array. Then, it operated directly on that array.
With these changes, the only thing the regular buffer addressing was still needed for was the color image.

Now, this commit also copies the color image into the prefiltered buffer. Therefore, it's not really just a prefiltered buffer anymore, but actually contains all the data needed to denoise.
This allows the kernel-device interface to be redesigned and cleaned up, which is also done in this commit.

Advantages are:
 - Lower addressing overhead - every pixel is only addressed once to copy the data to the denoising buffer, and once to store the final result - instead of hundreds of accesses per pixel when looping over the filter window.
 - Lower code complexity - one array with standard scanline addressing makes the code a lot cleaner.
 - For GPUs: More memory access coherence since the passes are stored in SoA layout instead of AoS (like the regular RenderBuffers are).
 - For CPUs: Possibility to use SIMD instructions in the future due to the SoA layout.
The disadvantage is slightly higher memory usage - 22 floats per pixel instead of 16.

This commit doesn't include the CUDA changes yet.
2016-08-06 05:22:51 +02:00
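The addressing difference behind the SoA/AoS point above can be shown with plain index arithmetic (a sketch, not the actual Cycles code): in an AoS layout like the regular RenderBuffers, adjacent pixels of one pass sit `num_passes` floats apart, while the SoA denoising buffer makes them contiguous, which is what enables coalesced GPU loads and future CPU SIMD.

```python
def aos_index(x, y, p, width, num_passes):
    # RenderBuffers-style layout: all passes of one pixel interleaved.
    return (y * width + x) * num_passes + p

def soa_index(x, y, p, width, height):
    # Denoise-buffer layout: one contiguous plane per pass.
    return p * (width * height) + y * width + x
```

With the 22 floats per pixel mentioned above, neighbouring threads reading one pass touch addresses 22 floats apart in the AoS layout but 1 float apart in the SoA layout.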
b5c773469e Cycles: Implement the Filter Weight functionality
For now, the slider defines the weight given to feature pass variances (which now is 2**filter_strength instead of 1).
2016-08-06 05:22:51 +02:00
144f41ba4e Cycles: Fix potential divide by zero in the area light sampling code 2016-08-06 05:22:51 +02:00
66e8c720a3 Cycles: Fix wrong array indexing in the feature kernel 2016-08-06 05:22:51 +02:00
da1d462467 Cycles: Replace NaNs in the feature passes with zero
As soon as any feature pass sample is NaN, every pixel which contains that sample in its filter window will be black in the filtered result.
Ideally no NaNs should be generated in the first place, but there are quite a few cases where they are generated in Cycles and now become visible.
So, as a temporary fix, NaNs are now replaced with zero when storing the passes. Ideally these NaNs should be fixed for good, of course.
2016-08-06 05:19:33 +02:00
2be1f631ee Cycles: Prefilter all feature passes
This commit adds prefiltering to all feature passes, instead of just the shadow pass.
Feature passes are supposed to be noise-free, but effects like Depth of Field, Motion Blur or slightly glossy shaders could still produce noticeable amounts of noise.
2016-08-06 05:17:51 +02:00
7f1cbfb9fb Cycles: Fix a memory leak when cancelling a denoised render 2016-07-27 22:36:06 +02:00
4492320024 Cycles: Fix another cause for NaN normals and a potential divide-by-zero in the background texture 2016-07-27 21:43:46 +02:00
a293e5b7e1 Cycles: Add Debug build option that enables floating point exceptions
By enabling the new WITH_CYCLES_DEBUG_FPE, floating point exceptions are enabled in the CPU kernels.
That way, the debugger stops as soon as an invalid calculation is performed, which makes it a lot easier to track these issues.

Note that the option may cause problems in combination with the --debug-fpe runtime option.
2016-07-27 21:43:46 +02:00
d7f8670f0e Cycles: Fix a rare divide-by-zero in a fallback sampling code
The issue only appeared when the random number happens to be exactly one
while a fallback sampling code for shallow paths was being executed.
2016-07-27 20:18:10 +02:00
41b8fe9e9e Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-07-26 03:37:09 +02:00
ab1c7a717d Render API/Cycles: Add separate "keep highlights" parameter to the end_result function to fix Save Buffers with Denoising 2016-07-26 03:36:31 +02:00
6cccc08b26 Merge commit '3e05be953f619b78c8721631549f9babb0950f4c' into soc-2016-cycles_denoising 2016-07-24 03:44:51 +02:00
9791a8fe95 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	intern/cycles/kernel/kernel_types.h
2016-07-24 03:02:42 +02:00
e0d3b29ad0 Cycles: Also prefilter and use the shadow feature pass with CUDA 2016-07-24 02:54:41 +02:00
f487a15335 Cycles: Use the prefiltered shadow feature for denoising
This commit finally adds the prefiltered shadow feature to the main denoising algorithm.
Doing so improves detail preservation a lot: Although the main focus are sharp shadow edges, it actually also helps for Ambient-Occlusion-like and geometric details.

The only issue is that some geometric edges might be a bit noisier after denoising, but that will be fixed in the future by downweighting the shadow feature
when the geometric features (normals and depth) change strongly.
2016-07-24 02:51:34 +02:00
170c70b290 Cycles: Prefilter the shadow feature passes
The previous commit already generates the features, but they're quite noisy, which is unacceptable for an LWR feature since it leads to noise in the result.

The filtering algorithm is:
1. filter_divide_shadow:
 - Divide the R and G channels of both passes to get two noisy shadow passes
 - Scale the B channels and combine them to get the approximate Sample Variance
 - Compute the squared difference of the A and B divided passes to get the correct, but also noisy, Buffer Variance
 - Compute the squared difference of the A and B Sample Variances to get the variance of the Sample Variance estimate
2. filter_non_local_means:
 - Smooth the Buffer Variance using Non-Local Means with weights derived from the Sample Variance pass
3. filter_non_local_means:
 - Smooth the A and B shadow passes using Non-Local Means with weights derived from the other pass (B to smooth A, A to smooth B) and from the smoothed buffer variance.
4. filter_combine_halves:
 - Compute the squared difference of the A and B smoothed shadow passes to estimate the residual variance in the channels.
5. filter_non_local_means:
 - Smooth the two passes again using each other and the residual variance for weights.
6. filter_combine_halves:
 - Average the two double-smoothed passes to obtain the final shadow feature used for the LWR algorithm.

Although the algorithm might sound rather slow, that's not the case. This can be seen by reducing the half window: Doing so reduces the time LWR takes, but the prefiltering stays the same. So, since the time used can be reduced drastically with the half window, prefiltering can't be the bottleneck.
Also, the amount of repeated smoothing sounds like it destroys fine details. However, that is not the case: Due to taking variance into account and the remarkable quality of the NLM filter, details that only span a couple of pixels are still preserved without blurring.

The final feature isn't used yet, that will be added in the next commit.
2016-07-24 02:37:29 +02:00
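The six steps above can be sketched as a toy Python pipeline. This is only an illustration of the data flow: a 3-tap box filter stands in for the real NLM kernel, the variance-derived weights are omitted, and all names are hypothetical.

```python
def combine_halves(a, b):
    # filter_combine_halves: mean of the two half buffers, plus half
    # the squared difference as a residual variance estimate.
    mean = [(x + y) / 2.0 for x, y in zip(a, b)]
    var = [(x - y) ** 2 / 2.0 for x, y in zip(a, b)]
    return mean, var

def smooth(buf):
    # Stand-in for filter_non_local_means: the real kernel weights
    # neighbours by patch similarity and variance; a 3-tap box filter
    # here just shows where smoothing happens in the pipeline.
    out = []
    for i in range(len(buf)):
        lo, hi = max(0, i - 1), min(len(buf), i + 2)
        out.append(sum(buf[lo:hi]) / (hi - lo))
    return out

def prefilter_shadow(shadow_a, shadow_b):
    # Steps 3-6: cross-smooth the halves, estimate residual variance,
    # smooth again, then average the two double-smoothed passes.
    a1, b1 = smooth(shadow_a), smooth(shadow_b)   # step 3 (B guides A etc.)
    _, residual_var = combine_halves(a1, b1)      # step 4 (unused in this toy)
    a2, b2 = smooth(a1), smooth(b1)               # step 5, weights omitted
    final, _ = combine_halves(a2, b2)             # step 6
    return final
```

Steps 1 and 2 (filter_divide_shadow and the buffer-variance smoothing) are left out since they only prepare the inputs to this loop.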
462adab15d Cycles: Generate and fill the Shadow feature passes
This commit makes the path tracing kernel generate the needed data for the shadow passes and fill them.

The approach used here is quite similar to a shadowcatcher:
The R channel of the passes records the total energy of direct light queries, no matter whether they were shadowed or not.
The G channel records the amount of energy that actually was unoccluded. Therefore, dividing G by R produces an accurate pass containing the amount of shadowing (between zero and one, since G can never be larger than R).
The B channel contains an approximation of the variance of the shadow info.
However, since Var[E[Unoccluded]/E[Full]] (which is what would be needed) is impossible to compute analytically from the samples, Var[Unoccluded/Full] is used instead (Variance of the individual ratios at each sample, instead of the variance of the ratio of the mean values after sampling).
Although that's both biased and might not even be close, it is still of use in prefiltering: The correct variance can be estimated (with a lot of noise) from the difference of the A and B passes, and the approximation in the B channel can be used for the weights used to prefilter the noisy, but correct variance.
2016-07-24 02:29:37 +02:00
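The per-pixel channel arithmetic described above, as a small sketch (names are hypothetical; the division-by-two normalization for two independent half-buffer estimates is a standard assumption, not quoted from the commit):

```python
def shadow_amount(r_total, g_unoccluded):
    # G can never exceed R, so G/R is the fraction of direct light
    # that arrived unoccluded (1.0 = fully lit, 0.0 = fully shadowed).
    return g_unoccluded / r_total if r_total > 0.0 else 1.0

def buffer_variance(shadow_a, shadow_b):
    # The correct (but noisy) variance estimate comes from the even/odd
    # half buffers: for two independent estimates of the same mean,
    # E[(A - B)^2] = 2 * Var, hence the division by two.
    return (shadow_a - shadow_b) ** 2 / 2.0
```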
98b7447c9c Cycles: Add two shadow feature passes
These two passes will both hold the same feature, but one will be filled with data from even samples
and the other with data from odd samples. That allows to estimate buffer variance and prefilter it better later on.
2016-07-24 02:29:37 +02:00
2259c75064 Cycles: Fix combined feature bandwidth calculation
As explained in the code, the bandwidth variables actually store the inverse of the bandwidth
to save a couple of divides. However, if the bandwidth is to be multiplied with a factor, that means
that the variable must be divided by that factor, which the code previously failed to do.
2016-07-24 02:26:06 +02:00
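In code form, the fix amounts to the following (hypothetical names, just illustrating the inverse-storage convention):

```python
def scale_bandwidth(inv_bandwidth, factor):
    # The variable stores 1/h to save divides, so scaling the
    # bandwidth h by `factor` means dividing the stored inverse,
    # not multiplying it (which was the bug).
    return inv_bandwidth / factor
```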
747031222b Cycles: Tweak order of denoising feature vector to improve quality of results
This commit reorders the denoising features: Instead of "Normals, Texture, Depth, Screen position"
the order now is "Screen position, Depth, Normals, Texture" like in the old demo code, which significantly
improves result quality.
2016-07-24 02:24:26 +02:00
50fb0899ba Cycles: Add temporary debugging environment variable, fix debug file extensions
As explained previously, CPUs currently get the out-of-tile pixels for denoising
from neighbor tiles, while GPUs just render bigger tiles internally.
However, implementation differences can make the GPU version (aka "overscan" rendering)
fail while the CPU code works.

Therefore, this commit adds an environment variable check for whether CPU_OVERSCAN is defined,
and enables the already-present CPU single-tile overscan mode for easier debugging.
Note that this has no benefits at all for regular use and will be removed later!
2016-07-24 02:21:09 +02:00
7feef16b0d Cycles: Fix Denoising feature pass generation when using OSL 2016-07-24 02:20:53 +02:00
f60bb19492 Cycles: Fix border rendering once again with Denoising 2016-07-24 02:20:39 +02:00
3e05be953f Fix playerstubs 2016-07-19 13:29:59 +02:00
3fc38da38d Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-07-18 20:37:57 +02:00
b317185a99 Cycles: Fix NaN smooth normals 2016-07-18 19:48:21 +02:00
50d1778acd Render API: Preserve extended passes when using bordered rendering 2016-07-18 19:47:52 +02:00
da65ede7af Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	intern/cycles/kernel/kernel_types.h
2016-07-11 22:46:22 +02:00
58619096a5 Cycles/Render API: Implement interactive updating while denoising
The tile highlighting is still a bit random and the progress bar isn't showing either,
but the basic live update works.
To avoid lots of duplicated code, editors/render and editors/space_image now share two functions.
2016-07-11 22:41:11 +02:00
9cfe404cd4 Cycles: Fix memory leak when denoising rendered images 2016-07-11 19:37:47 +02:00
3bcafb6dc4 Render API: Implement the denoising process as a job to keep the UI responsive 2016-07-11 19:36:37 +02:00
e755ecde9f Cycles: Support denoising after rendering on CUDA 2016-07-11 18:25:06 +02:00
7921413fde Cycles: Once again fix MSVC building
Apparently, MSVC doesn't like enum values being larger than 32 bits.
So, for now the enum entries are just #define'd instead.
2016-07-08 03:39:37 +02:00
005d83a0bb Cycles: Add additional debugging output containing the estimated rMSE reduction per sample 2016-07-07 20:12:35 +02:00
924041c49f Cycles: Fix 2-parameter least-squares helper function 2016-07-07 20:12:12 +02:00
40fff9b12f Cycles: Fix artifacts in lower-left tile when denoising on the CPU 2016-07-07 20:10:29 +02:00
3830bf4bbc Cycles: Add experimental alternative SVD-threshold norm based on power iteration
One of the first steps of the algorithm is to apply a truncated SVD to a
matrix formed by the features in order to reduce the dimensionality of the
problem and decorrelate dimensions.

The truncation threshold, according to the paper, is defined by twice the
spectral norm of the matrix that is formed like the first one, but from the
variances of the features.

The reference implementation doesn't compute the spectral norm, but instead
computes the Frobenius norm and multiplies it with sqrt(Rank)/2.
That makes sense since it's guaranteed that the Frobenius norm lies somewhere
between one and sqrt(Rank) times the spectral norm.

However, it's still an approximation. Therefore, in this commit I've tried
to directly compute the spectral norm since the runtime performance is currently
mainly limited by the per-pixel loops, so a small constant increase shouldn't matter too much.
In order to compute it, the code just constructs the Gramian matrix of the variance matrix
and computes the square root of its largest eigenvalue, which is found via power iteration.

I haven't tested the code too much yet, but it seems that the improvement is quite negligible
and not really worth the effort. Still, it might be interesting to tweak it further.
2016-07-07 00:50:45 +02:00
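The power-iteration approach described in the commit can be sketched in a few lines of Python (a standalone illustration, not the Cycles implementation):

```python
import math

def spectral_norm(m, iterations=100):
    # Largest singular value of matrix m (a list of rows), found as
    # described above: build the Gramian G = m^T m and take the square
    # root of its largest eigenvalue, located via power iteration.
    rows, cols = len(m), len(m[0])
    g = [[sum(m[k][i] * m[k][j] for k in range(rows)) for j in range(cols)]
         for i in range(cols)]
    v = [1.0] * cols
    for _ in range(iterations):
        w = [sum(g[i][j] * v[j] for j in range(cols)) for i in range(cols)]
        norm = math.sqrt(sum(x * x for x in w))
        if norm == 0.0:          # zero matrix: spectral norm is zero
            return 0.0
        v = [x / norm for x in w]
    # Rayleigh quotient v^T G v gives the dominant eigenvalue of G.
    gv = [sum(g[i][j] * v[j] for j in range(cols)) for i in range(cols)]
    return math.sqrt(sum(a * b for a, b in zip(gv, v)))
```

Power iteration converges at a rate governed by the eigenvalue gap, which is why a fixed small iteration count is usually enough inside a per-pixel loop.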
83c37d722d Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising 2016-07-06 21:25:57 +02:00
c5301948c0 Cycles: Fix denoising artifacts caused by Emission closures 2016-07-06 21:25:05 +02:00
8ad16d7923 Cycles: Fix denoising debug passes with CUDA 2016-07-06 21:24:44 +02:00
0d78ac4199 Cycles: Implement Postprocessing callback
This commit finally uses all the work from the earlier commits to implement
the postprocess callback, which triggers a denoising pass on the current
RenderResult.

Note that this does not work on GPUs yet - they can be used for rendering,
but the Device setting has to be switched to CPU rendering before using the
postprocess button.

One other remaining problem is that the Image editor view isn't updated automatically,
you have to switch to another pass/layer and back to see the change.

Also, the feature should eventually be implemented as a Job to get a progress bar and a
responsive UI during denoising.
2016-07-06 04:24:11 +02:00
1c92841e1e Cycles: Add denoise-only mode to the Session
This mode just creates the Device, generates tiles and runs the denoising kernel
on each tile.
Compared to the regular mode of operation, a lot is missing: No interactivity, no
scene syncing, no progressive rendering etc. However, these features aren't needed
for the denoise-after-render feature, and so this mode saves a lot of code when
calling it from the bindings.

Internally, it uses one single large buffer to hold the image, instead of a small buffer
per tile. That requires some changes to the TileManager and is also the reason for the
earlier region-of-interest commit.
2016-07-06 04:19:33 +02:00
f3300db921 Cycles: Implement RenderResult -> RenderBuffers import
This commit adds a function that takes an existing RenderResult and copies the passes
to newly created RenderBuffers. That will later be used to copy the rendered image
back for post-processing denoising.
2016-07-06 04:14:56 +02:00
ef814f016a Cycles: Explicitly pass region of interest to RenderBuffers when exporting pixels to Blender
By itself, that change is pretty useless, but it's important for the upcoming denoise-after-render support.
2016-07-06 04:09:41 +02:00
5f31fc951e Render API: Fix bugs regarding pass handling when Cycles debugging is enabled
The problem here was that regular passes are stored as (signed) integers.
Therefore, the pass with bit 31 set (the Cycles debug pass) is stored as -1.
On its own, that's fine - however, when that pass type is implicitly cast to uint64_t
in a function call, the compiler apparently first sign-extends it and then reinterprets
it as unsigned, so the result is 0xffffffff80000000 instead of only bit 31.
To get around that issue, the type is now explicitly cast to an unsigned int32 first and
then implicitly extended to uint64_t.
2016-07-06 03:35:36 +02:00
84b4575e9e Cycles: Add additional debugging info to the denoiser 2016-07-06 03:32:53 +02:00
173aeafc27 Cycles: Fix border rendering with denoising enabled
The code assumed that the rendered image starts at (0, 0), which is of course wrong for border rendering.
2016-07-04 15:52:10 +02:00
80332591ca Fix compilation on MSVC with 64-bit bitmasks in the Pass flag
Apparently MSVC doesn't like the (1UL << x) syntax, so now 1 is explicitly cast to a 64-bit integer before shifting.
2016-07-04 15:51:03 +02:00
e1f266d079 Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising
Conflicts:
	intern/cycles/kernel/kernel_path.h
	intern/cycles/kernel/kernel_path_branched.h
	intern/cycles/kernel/kernel_types.h
	intern/cycles/render/buffers.cpp
	intern/cycles/render/buffers.h
	intern/cycles/render/film.cpp
	intern/cycles/render/film.h
	intern/cycles/render/integrator.cpp
	source/blender/render/intern/source/render_result.c
2016-07-04 00:15:35 +02:00
34e8efecea Cycles: Add the two new headers to CMake 2016-06-24 23:58:32 +02:00
9bdf0cc5ac Cycles: Implement Postprocessing API
This commit implements the Postprocessing API in the Cycles Python bindings
and the poll function in the actual Cycles code.
The actual postprocessing doesn't do any processing yet.
2016-06-21 19:27:29 +02:00
c167e5d723 Render API/Image Editor: Add postprocessing operator that calls the render engine when executed
This commit adds a general operator for postprocessing render results in the Image editor.
To do so, the render API is extended by two functions:
 - can_postprocess checks whether the current render result can be postprocessed by the engine.
   For the denoiser, this will check whether the required passes have been rendered.
 - postprocess is executed when the user runs the operator. For the denoiser, this will do the actual denoising.
2016-06-21 19:23:25 +02:00
51c1c3636a Cycles: Cleanup - use an enum instead of hardcoded types, scale data passes by scale
This commit turns the extended pass types into an enum for nicer code.
Also, the feature passes are now scaled like the regular ones, which means that the passes
are shown correctly in the Image editor.
2016-06-21 19:21:16 +02:00
7d850d790e Cycles: Add CPU overscan denoising as a debug option
In order to allow easier debugging of memory-related bugs in the GPU code,
this commit adds the option to also use overscan denoising on CPUs.
2016-06-20 22:49:37 +02:00
26bcfb0f9d Cycles: Implement GPU denoising
This commit adds the CUDA denoising kernels and host code.
2016-06-20 22:48:25 +02:00
562392cc85 Cycles: Implement tile overscan for GPU denoising
This commit adds support for tile overscan - rendering a larger tile internally
and only showing its center area. That is needed for GPU denoising since the regular
approach of keeping the neighbor tiles in memory would require far too much memory.
Since tiles are generally quite large on GPUs, the added overhead isn't too large.
2016-06-20 22:44:39 +02:00
a090544336 Cycles: Add experimental second-order fitting for denoising, disabled for now
This change can help a lot with shadow edges, but adds artifacts to smoother areas.
In the future, I'll look into adaptively selecting the polynomial order.
2016-06-19 18:02:26 +02:00
8520b29c84 Cycles: Implement denoising kernels
This commit finally adds the denoising kernels, which means that basic denoising is operational now.
2016-06-19 18:01:41 +02:00
dfef4de5d0 Cycles: Add device code and kernel stubs for denoising
This commit implements stub kernels and the required host-side code to call them (CPU-only for now).
2016-06-19 17:59:23 +02:00
515db4c7d2 Cycles: Add util header with various matrix/vector math operations for the denoiser 2016-06-19 17:55:55 +02:00
9b2dc8a9ab Cycles: Add function to get neighbor tiles for denoising
This is needed by the denoising task to get the buffers of the nearby tiles.
Also, this commit syncs the half window option into the kernel data.
2016-06-19 17:52:24 +02:00
1e0c4acb29 Cycles: Separate the render result for selective denoising
This commit finally implements the selective denoising pass writing.
With this commit, the denoising feature passes and therefore the changes to the
regular Cycles kernels should be finished.
2016-06-04 21:54:19 +02:00
5f97b51d15 Cycles: Refactor storing of the integration result
This commit refactors how the integration result is stored: Instead of summing up
the PathRadiance in the integration function and returning the final color, the integration
function now fills a PathRadiance passed to it and just returns the alpha value.
The main kernel function then passes that PathRadiance to kernel_write_result, a new function
which then handles summing, clamping and storing of light and combined passes.

This commit by itself shouldn't change existing behaviour, but is needed for the upcoming
selective denoising.
2016-06-04 21:50:11 +02:00
85bd1a8ad4 Cycles: Write denoising feature passes
With this commit, the newly added passes finally get some content.
The three explicitly stored features are the surface normal, the albedo of the surface
and the path length from the camera. These features will be used by the denoiser to "understand"
where differences in lighting come from - for example, the normal pass allows the denoiser to
smooth the noise on a wall, but keep the edge in the corner of the room perfectly sharp.
To preserve small detail like bumpmapped reflections, the actual normal used for shading
is stored instead of the surface normal which is used for the regular Normal pass.
The main purpose of the albedo pass is to preserve fine texture detail, but it can also help to detect
object borders.
The depth pass helps for some edges where both surfaces have the same orientation
(so that normals don't help), but its variance also helps to detect depth-of-field blurring.

The real image passes aren't stored yet because they still require a bit of refactoring.
2016-06-04 21:41:14 +02:00
a2d338091b Cycles: Add a roughness variable to ShaderClosures to be used for the denoise feature heuristic
To produce better results for sharp reflections/refractions, the denoise features are only written at the first
rough/diffuse bounce. To determine whether the current bounce is rough/diffuse or not, the roughness of the individual
closures will be used.
Also, the PathState tracks the total length of the path, for the same reason (it might not be written at the first bounce).
2016-06-04 21:37:30 +02:00
15decbde5c Cycles: Allocate and sync the denoise passes
With this commit, Cycles will add the passes to the result if keeping them is activated.
The actual data isn't written yet, so it just returns zeros.
2016-06-04 21:35:23 +02:00
7dbca7064c Render: Add denoising passes to the Render code
These passes will eventually be hidden from the user, but for now they're really
useful for debugging.
2016-06-04 21:29:28 +02:00
c6e9ff6ec6 Render API: Allow renderers to allocate additional passes
This commit adds the add_pass function to the renderer API, which allows a renderer
to add a pass to a specific render layer or to all of them. Since the RNA only has 32-bit integers as
argument types, the position of the passtype bit is passed instead of the actual passtype.
When additional RenderResults are allocated, all additional passes added to the main result
are added to it as well.
2016-06-04 21:24:49 +02:00
72266d21b7 Render: Extend the passtype field of RenderPasses to 64 bit
This commit extends the number of possible pass types to 64 bit. However, it only
affects the structures used for storage during and after rendering, not the SceneRenderLayer
that's visible to the user (due to various limitations of 32-bit integers in RNA).
Therefore, the extended passes are mainly meant to be allocated by the renderer based on some other setting.
2016-06-04 21:21:52 +02:00
079616159f Cycles Denoising: Defaults for the RenderLayer options 2016-05-27 21:43:00 +02:00
2d0bfbd6f6 Cycles Denoising: Add RenderLayer options
This commit adds the necessary parameters to the RenderLayer panel and syncs them to Cycles.
It would be nicer to define the RNA properties from Cycles Python code like most other ones, but
since that's not possible for RenderLayers, it has to be added in the DNA :/
2016-05-27 21:42:43 +02:00
e855dede38 Cycles Denoising: Implement the denoising data render passes in Cycles
The parameters aren't set anywhere yet since that requires them to be added to the RenderLayer first.
2016-05-27 21:42:24 +02:00
49018d273c Cycles Denoise: Modify TileManager to allow multiple tasks per tile
This commit changes the TileManager and Session so that one tile can be processed
in multiple steps, here rendering and denoising. The reason for that is that a tile
can only be denoised as soon as all its neighbors are rendered (since the filter window
will cross tile borders) and only be freed as soon as all its neighbors are denoised
(otherwise they wouldn't have the out-of-tile data either).
Therefore, Tiles now have a State and are actually stored for the whole render process.
Tile Highlighting also needed a bit of tweaking to un-highlight tiles between rendering
and denoising.
2016-05-27 21:42:02 +02:00
b658361954 Cycles Denoising: Change PATH_TRACE task to more flexible RENDER task
This commit renames the PATH_TRACE task to RENDER and adds subtypes to the RenderTile, for now
PATH_TRACE and DENOISE. The reason for doing this instead of simply adding a new DENOISE
task is that it 1. allows reusing the acquire_tile system etc. and 2. allows denoising tiles
while others are still rendering (if DENOISE were its own task, it would have to wait until PATH_TRACE
ran out of tiles).
The task isn't used yet, that's for the upcoming commits.
2016-05-27 21:41:20 +02:00
117 changed files with 6458 additions and 1146 deletions

View File

@@ -238,3 +238,15 @@ def register_passes(engine, scene, srl):
if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
if crl.use_denoising and crl.denoising_store_passes:
engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR');
engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR');
engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE');
engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE');
engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR');
engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR');
engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR');

View File

@@ -1195,6 +1195,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
default=False,
)
cls.use_denoising = BoolProperty(
name="Use Denoising",
description="Denoise the rendered image",
default=False,
)
cls.denoising_diffuse_direct = BoolProperty(
name="Diffuse Direct",
description="Denoise the direct diffuse lighting",
default=True,
)
cls.denoising_diffuse_indirect = BoolProperty(
name="Diffuse Indirect",
description="Denoise the indirect diffuse lighting",
default=True,
)
cls.denoising_glossy_direct = BoolProperty(
name="Glossy Direct",
description="Denoise the direct glossy lighting",
default=True,
)
cls.denoising_glossy_indirect = BoolProperty(
name="Glossy Indirect",
description="Denoise the indirect glossy lighting",
default=True,
)
cls.denoising_transmission_direct = BoolProperty(
name="Transmission Direct",
description="Denoise the direct transmission lighting",
default=True,
)
cls.denoising_transmission_indirect = BoolProperty(
name="Transmission Indirect",
description="Denoise the indirect transmission lighting",
default=True,
)
cls.denoising_subsurface_direct = BoolProperty(
name="Subsurface Direct",
description="Denoise the direct subsurface lighting",
default=True,
)
cls.denoising_subsurface_indirect = BoolProperty(
name="Subsurface Indirect",
description="Denoise the indirect subsurface lighting",
default=True,
)
cls.denoising_strength = FloatProperty(
name="Denoising Strength",
description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
min=0.0, max=1.0,
default=0.5,
)
cls.denoising_feature_strength = FloatProperty(
name="Denoising Feature Strength",
description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
min=0.0, max=1.0,
default=0.5,
)
cls.denoising_radius = IntProperty(
name="Denoising Radius",
description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
min=1, max=50,
default=8,
)
cls.denoising_relative_pca = BoolProperty(
name="Relative filter",
description="When removing features that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
default=False,
)
cls.denoising_store_passes = BoolProperty(
name="Store denoising passes",
description="Store the denoising feature passes and the noisy image",
default=False,
)
@classmethod
def unregister(cls):
del bpy.types.SceneRenderLayer.cycles

View File

@@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
col.prop(rl, "use_pass_emit", text="Emission")
col.prop(rl, "use_pass_environment")
if context.scene.cycles.feature_set == 'EXPERIMENTAL':
col.separator()
sub = col.column()
sub.active = crl.use_denoising
sub.prop(crl, "denoising_store_passes", text="Denoising")
if _cycles.with_cycles_debug:
col = layout.column()
col.prop(crl, "pass_debug_bvh_traversed_nodes")
@@ -581,6 +587,64 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
row.prop(rv, "camera_suffix", text="")
class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
bl_label = "Denoising"
bl_context = "render_layer"
bl_options = {'DEFAULT_CLOSED'}
def draw_header(self, context):
rd = context.scene.render
rl = rd.layers.active
crl = rl.cycles
self.layout.prop(crl, "use_denoising", text="")
def draw(self, context):
layout = self.layout
scene = context.scene
rd = scene.render
rl = rd.layers.active
crl = rl.cycles
split = layout.split()
col = split.column()
sub = col.column(align=True)
sub.prop(crl, "denoising_radius", text="Radius")
sub.prop(crl, "denoising_strength", slider=True, text="Strength")
col = split.column()
sub = col.column(align=True)
sub.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
sub.prop(crl, "denoising_relative_pca")
layout.separator()
row = layout.row()
row.label(text="Diffuse:")
sub = row.row(align=True)
sub.prop(crl, "denoising_diffuse_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_diffuse_indirect", text="Indirect", toggle=True)
row = layout.row()
row.label(text="Glossy:")
sub = row.row(align=True)
sub.prop(crl, "denoising_glossy_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_glossy_indirect", text="Indirect", toggle=True)
row = layout.row()
row.label(text="Transmission:")
sub = row.row(align=True)
sub.prop(crl, "denoising_transmission_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_transmission_indirect", text="Indirect", toggle=True)
row = layout.row()
row.label(text="Subsurface:")
sub = row.row(align=True)
sub.prop(crl, "denoising_subsurface_direct", text="Direct", toggle=True)
sub.prop(crl, "denoising_subsurface_indirect", text="Indirect", toggle=True)
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
bl_label = "Post Processing"
bl_options = {'DEFAULT_CLOSED'}
@@ -1734,6 +1798,7 @@ classes = (
CyclesRender_PT_layer_options,
CyclesRender_PT_layer_passes,
CyclesRender_PT_views,
CyclesRender_PT_denoising,
Cycles_PT_post_processing,
CyclesCamera_PT_dof,
Cycles_PT_context_material,

View File

@@ -299,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
static void end_render_result(BL::RenderEngine& b_engine,
BL::RenderResult& b_rr,
bool cancel,
bool highlight,
bool do_merge_results)
{
b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
}
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
{
BufferParams& params = rtile.buffers->params;
int x = params.full_x - session->tile_manager.params.full_x;
@@ -340,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
update_render_result(b_rr, b_rlay, rtile);
}
end_render_result(b_engine, b_rr, true, true);
end_render_result(b_engine, b_rr, true, highlight, true);
}
else {
/* write result */
write_render_result(b_rr, b_rlay, rtile);
end_render_result(b_engine, b_rr, false, true);
end_render_result(b_engine, b_rr, false, false, true);
}
}
void BlenderSession::write_render_tile(RenderTile& rtile)
{
do_write_update_render_tile(rtile, false);
do_write_update_render_tile(rtile, false, false);
}
void BlenderSession::update_render_tile(RenderTile& rtile)
void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
{
/* use final write for preview renders, otherwise render result wouldn't be
* be updated in blender side
* would need to be investigated a bit further, but for now shall be fine
*/
if(!b_engine.is_preview())
do_write_update_render_tile(rtile, true);
do_write_update_render_tile(rtile, true, highlight);
else
do_write_update_render_tile(rtile, false);
do_write_update_render_tile(rtile, false, false);
}
void BlenderSession::render()
{
/* set callback to write out render results */
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
/* get buffer parameters */
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
@@ -391,7 +392,7 @@ void BlenderSession::render()
/* layer will be missing if it was disabled in the UI */
if(b_single_rlay == b_rr.layers.end()) {
end_render_result(b_engine, b_rr, true, false);
end_render_result(b_engine, b_rr, true, true, false);
continue;
}
@@ -407,6 +408,28 @@ void BlenderSession::render()
}
buffer_params.passes = passes;
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
buffer_params.denoising_data_pass = get_boolean(crl, "use_denoising");
session->tile_manager.schedule_denoising = get_boolean(crl, "use_denoising");
session->params.use_denoising = get_boolean(crl, "use_denoising");
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
scene->film->denoising_flags = 0;
if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
session->params.denoising_radius = get_int(crl, "denoising_radius");
session->params.denoising_strength = get_float(crl, "denoising_strength");
session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
scene->film->tag_passes_update(scene, passes);
scene->film->tag_update(scene);
@@ -460,7 +483,7 @@ void BlenderSession::render()
}
/* free result without merging */
end_render_result(b_engine, b_rr, true, false);
end_render_result(b_engine, b_rr, true, true, false);
if(session->progress.get_cancel())
break;
@@ -666,6 +689,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
/* copy pixels */
read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
}
else {
int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
if(denoising_offset >= 0) {
read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
}
}
if(!read) {
memset(&pixels[0], 0, pixels.size()*sizeof(float));

View File

@@ -79,7 +79,7 @@ public:
void update_render_result(BL::RenderResult& b_rr,
BL::RenderLayer& b_rlay,
RenderTile& rtile);
void update_render_tile(RenderTile& rtile);
void update_render_tile(RenderTile& rtile, bool highlight);
/* interactive updates */
void synchronize();
@@ -147,7 +147,7 @@ protected:
BL::RenderLayer& b_rlay,
RenderTile& rtile,
bool do_update_only);
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
int builtin_image_frame(const string &builtin_name);
void builtin_image_info(const string &builtin_name,

View File

@@ -525,6 +525,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
return PASS_NONE;
}
int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
{
string name = b_pass.name();
if(name.substr(0, 10) != "Denoising ") {
return -1;
}
name = name.substr(10);
#define MAP_PASS(passname, offset) if(name == passname) return offset;
MAP_PASS("Normal", DENOISING_PASS_NORMAL);
MAP_PASS("Normal Variance", DENOISING_PASS_NORMAL_VAR);
MAP_PASS("Albedo", DENOISING_PASS_ALBEDO);
MAP_PASS("Albedo Variance", DENOISING_PASS_ALBEDO_VAR);
MAP_PASS("Depth", DENOISING_PASS_DEPTH);
MAP_PASS("Depth Variance", DENOISING_PASS_DEPTH_VAR);
MAP_PASS("Shadow A", DENOISING_PASS_SHADOW_A);
MAP_PASS("Shadow B", DENOISING_PASS_SHADOW_B);
MAP_PASS("Image", DENOISING_PASS_COLOR);
MAP_PASS("Image Variance", DENOISING_PASS_COLOR_VAR);
#undef MAP_PASS
return -1;
}
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
BL::SceneRenderLayer& b_srlay)
{
@@ -544,8 +568,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
Pass::add(pass_type, passes);
}
#ifdef __KERNEL_DEBUG__
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
if(get_boolean(crp, "denoising_store_passes")) {
b_engine.add_pass("Denoising Normal", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Depth", 1, "Z", b_srlay.name().c_str());
b_engine.add_pass("Denoising Depth Variance", 1, "Z", b_srlay.name().c_str());
b_engine.add_pass("Denoising Shadow A", 3, "XYV", b_srlay.name().c_str());
b_engine.add_pass("Denoising Shadow B", 3, "XYV", b_srlay.name().c_str());
b_engine.add_pass("Denoising Image", 3, "RGB", b_srlay.name().c_str());
b_engine.add_pass("Denoising Image Variance", 3, "RGB", b_srlay.name().c_str());
}
#ifdef __KERNEL_DEBUG__
if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
Pass::add(PASS_BVH_TRAVERSED_NODES, passes);

View File

@@ -96,6 +96,7 @@ public:
int width, int height);
static PassType get_pass_type(BL::RenderPass& b_pass);
static int get_denoising_pass(BL::RenderPass& b_pass);
private:
/* sync */

View File

@@ -25,6 +25,7 @@ set(SRC
device.cpp
device_cpu.cpp
device_cuda.cpp
device_denoising.cpp
device_multi.cpp
device_opencl.cpp
device_split_kernel.cpp
@@ -48,6 +49,7 @@ endif()
set(SRC_HEADERS
device.h
device_denoising.h
device_memory.h
device_intern.h
device_network.h

View File

@@ -402,4 +402,16 @@ void Device::free_memory()
devices.free_memory();
}
device_sub_ptr::device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type)
: device(device)
{
ptr = device->mem_alloc_sub_ptr(mem, offset, size, type);
}
device_sub_ptr::~device_sub_ptr()
{
device->mem_free_sub_ptr(ptr);
}
CCL_NAMESPACE_END

View File

@@ -228,6 +228,7 @@ struct DeviceDrawParams {
};
class Device {
friend class device_sub_ptr;
protected:
Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
@@ -237,6 +238,14 @@ protected:
/* used for real time display */
unsigned int vertex_buffer;
virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/, int /*offset*/, int /*size*/, MemoryType /*type*/)
{
/* Only required for devices that implement denoising. */
assert(false);
return (device_ptr) 0;
}
virtual void mem_free_sub_ptr(device_ptr /*ptr*/) {};
public:
virtual ~Device();
@@ -265,6 +274,8 @@ public:
virtual void mem_zero(device_memory& mem) = 0;
virtual void mem_free(device_memory& mem) = 0;
virtual int mem_address_alignment() { return 16; }
/* constant memory */
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
@@ -312,6 +323,8 @@ public:
/* multi device */
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
virtual int device_number(Device * /*sub_device*/) { return 0; }
virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
/* static */
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);

File diff suppressed because it is too large

View File

@@ -21,11 +21,14 @@
#include <string.h>
#include "device/device.h"
#include "device/device_denoising.h"
#include "device/device_intern.h"
#include "device/device_split_kernel.h"
#include "render/buffers.h"
#include "kernel/filter/filter_defines.h"
#ifdef WITH_CUDA_DYNLOAD
# include "cuew.h"
#else
@@ -113,7 +116,7 @@ public:
DedicatedTaskPool task_pool;
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUmodule cuModule, cuFilterModule;
map<device_ptr, bool> tex_interp_map;
map<device_ptr, uint> tex_bindless_map;
int cuDevId;
@@ -170,7 +173,7 @@ public:
CUresult result = stmt; \
\
if(result != CUDA_SUCCESS) { \
string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -301,7 +304,8 @@ public:
* kernel sources md5 and only depends on compiler or compilation settings.
*/
string compile_kernel_get_common_cflags(
const DeviceRequestedFeatures& requested_features, bool split=false)
const DeviceRequestedFeatures& requested_features,
bool filter=false, bool split=false)
{
const int cuda_version = cuewCompilerVersion();
const int machine = system_cpu_bits();
@@ -316,7 +320,7 @@ public:
machine,
cuda_version,
include_path.c_str());
if(use_adaptive_compilation()) {
if(!filter && use_adaptive_compilation()) {
cflags += " " + requested_features.get_build_options();
}
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
@@ -364,8 +368,22 @@ public:
return true;
}
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
string compile_kernel(const DeviceRequestedFeatures& requested_features,
bool filter=false, bool split=false)
{
const char *name, *source;
if(filter) {
name = "filter";
source = "filter.cu";
}
else if(split) {
name = "kernel_split";
source = "kernel_split.cu";
}
else {
name = "kernel";
source = "kernel.cu";
}
/* Compute cubin name. */
int major, minor;
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
@@ -373,9 +391,8 @@ public:
/* Attempt to use kernel provided with Blender. */
if(!use_adaptive_compilation()) {
const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
: "lib/kernel_sm_%d%d.cubin",
major, minor));
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
name, major, minor));
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
if(path_exists(cubin)) {
VLOG(1) << "Using precompiled kernel.";
@@ -384,7 +401,7 @@ public:
}
const string common_cflags =
compile_kernel_get_common_cflags(requested_features, split);
compile_kernel_get_common_cflags(requested_features, filter, split);
/* Try to use locally compiled kernel. */
const string source_path = path_get("source");
@@ -395,9 +412,8 @@ public:
*/
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
: "cycles_kernel_sm%d%d_%s.cubin",
major, minor,
const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
name, major, minor,
cubin_md5.c_str());
const string cubin = path_cache_get(path_join("kernels", cubin_file));
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
@@ -432,7 +448,7 @@ public:
const string kernel = path_join(
path_join(source_path, "kernel"),
path_join("kernels",
path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
path_join("cuda", source)));
double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");
@@ -480,11 +496,14 @@ public:
return false;
/* get kernel */
string cubin = compile_kernel(requested_features, use_split_kernel());
string cubin = compile_kernel(requested_features, false, use_split_kernel());
if(cubin == "")
return false;
string filter_cubin = compile_kernel(requested_features, true, false);
if(filter_cubin == "")
return false;
/* open module */
cuda_push_context();
@@ -499,6 +518,14 @@ public:
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
if(path_read_text(filter_cubin, cubin_data))
result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
else
result = CUDA_ERROR_FILE_NOT_FOUND;
if(cuda_error_(result, "cuModuleLoad"))
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
cuda_pop_context();
return (result == CUDA_SUCCESS);
@@ -581,6 +608,11 @@ public:
}
}
virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
{
return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
}
void const_copy_to(const char *name, void *host, size_t size)
{
CUdeviceptr mem;
@@ -881,6 +913,368 @@ public:
}
}
bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
for(int i = 0; i < 9; i++) {
tiles->buffers[i] = buffers[i];
}
mem_copy_to(task->tiles_mem);
return !have_error();
}
#define CUDA_GET_BLOCKSIZE(func, w, h) \
int threads_per_block; \
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
int threads = (int)sqrt((float)threads_per_block); \
int xblocks = ((w) + threads - 1)/threads; \
int yblocks = ((h) + threads - 1)/threads;
#define CUDA_LAUNCH_KERNEL(func, args) \
cuda_assert(cuLaunchKernel(func, \
xblocks, yblocks, 1, \
threads, threads, 1, \
0, 0, args, 0));
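A minimal sketch of the launch-geometry math these two macros encode: take the largest square thread block that fits the kernel's reported thread limit, then ceil-divide the image extent by that side length. The helper name `get_blocksize` is illustrative, not part of Cycles.

```cpp
#include <cassert>
#include <cmath>

struct LaunchDims { int threads, xblocks, yblocks; };

// Mirrors CUDA_GET_BLOCKSIZE: square block side from the thread limit,
// then a ceiling division of width/height by that side.
LaunchDims get_blocksize(int max_threads_per_block, int w, int h)
{
	int threads = (int)sqrt((float)max_threads_per_block);
	return { threads,
	         (w + threads - 1) / threads,
	         (h + threads - 1) / threads };
}
```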
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
int4 rect = task->rect;
int w = rect.z-rect.x;
int h = rect.w-rect.y;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
float a = task->nlm_state.a;
float k_2 = task->nlm_state.k_2;
CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
int dx, dy;
int4 local_rect;
int channel_offset = 0;
void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
dy = i / (2*r+1) - r;
dx = i % (2*r+1) - r;
local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
}
local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
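The loop above walks every offset of the (2r+1)x(2r+1) NLM search window from a single flat index. A standalone sketch of that index-to-offset mapping (the helper name is hypothetical):

```cpp
#include <cassert>

// Converts a flat loop index i in [0, (2r+1)^2) into the 2D pixel
// offset (dx, dy), each component in [-r, r], row by row.
void offset_from_index(int i, int r, int *dx, int *dy)
{
	*dy = i / (2*r + 1) - r;
	*dx = i % (2*r + 1) - r;
}
```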
bool denoising_construct_transform(DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterConstructTransform;
cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
task->storage.w,
task->storage.h);
void *args[] = {&task->buffer.mem.device_pointer,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->filter_area,
&task->rect,
&task->radius,
&task->pca_threshold,
&task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_reconstruct(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
cuda_push_context();
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
int r = task->radius;
int f = 4;
float a = 1.0f;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
int local_rect[4] = {max(0, -dx), max(0, -dy),
task->reconstruction_state.source_w - max(0, dx),
task->reconstruction_state.source_h - max(0, dy)};
void *calc_difference_args[] = {&dx, &dy,
&guide_ptr,
&guide_variance_ptr,
&difference,
&local_rect,
&task->buffer.w,
&task->buffer.pass_stride,
&a,
&task->nlm_k_2};
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
void *blur_args[] = {&difference,
&blurDifference,
&local_rect,
&task->buffer.w,
&f};
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
void *calc_weight_args[] = {&blurDifference,
&difference,
&local_rect,
&task->buffer.w,
&f};
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
/* Reuse previous arguments. */
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
void *construct_gramian_args[] = {&dx, &dy,
&blurDifference,
&task->buffer.mem.device_pointer,
&color_ptr,
&color_variance_ptr,
&task->storage.transform.device_pointer,
&task->storage.rank.device_pointer,
&task->storage.XtWX.device_pointer,
&task->storage.XtWY.device_pointer,
&local_rect,
&task->reconstruction_state.filter_rect,
&task->buffer.w,
&task->buffer.h,
&f,
&task->buffer.pass_stride};
CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
}
void *finalize_args[] = {&task->buffer.w,
&task->buffer.h,
&output_ptr,
&task->storage.rank.device_pointer,
&task->storage.XtWX.device_pointer,
&task->storage.XtWY.device_pointer,
&task->filter_area,
&task->reconstruction_state.buffer_params.x,
&task->render_buffer.samples};
CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
device_ptr mean_ptr, device_ptr variance_ptr,
int r, int4 rect, DenoisingTask *task)
{
(void) task;
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterCombineHalves;
cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
void *args[] = {&mean_ptr,
&variance_ptr,
&a_ptr,
&b_ptr,
&rect,
&r};
CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr, DenoisingTask *task)
{
(void) task;
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterDivideShadow;
cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer,
&a_ptr,
&b_ptr,
&sample_variance_ptr,
&sv_variance_ptr,
&buffer_variance_ptr,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.denoising_data_offset,
&use_split_variance};
CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
bool denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task)
{
if(have_error())
return false;
cuda_push_context();
CUfunction cuFilterGetFeature;
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
bool use_split_variance = use_split_kernel();
void *args[] = {&task->render_buffer.samples,
&task->tiles_mem.device_pointer,
&mean_offset,
&variance_offset,
&mean_ptr,
&variance_ptr,
&task->rect,
&task->render_buffer.pass_stride,
&task->render_buffer.denoising_data_offset,
&use_split_variance};
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
cuda_assert(cuCtxSynchronize());
cuda_pop_context();
return !have_error();
}
void denoise(RenderTile &rtile, const DeviceTask &task)
{
DenoisingTask denoising(this);
denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
denoising.render_buffer.samples = rtile.sample;
RenderTile rtiles[9];
rtiles[4] = rtile;
task.map_neighbor_tiles(rtiles, this);
denoising.tiles_from_rendertiles(rtiles);
denoising.init_from_devicetask(task);
denoising.run_denoising();
task.unmap_neighbor_tiles(rtiles, this);
}
void path_trace(RenderTile& rtile, int sample, bool branched)
{
if(have_error())
@@ -1305,7 +1699,7 @@ public:
void thread_run(DeviceTask *task)
{
if(task->type == DeviceTask::PATH_TRACE) {
if(task->type == DeviceTask::RENDER) {
RenderTile tile;
bool branched = task->integrator_branched;
@@ -1313,30 +1707,8 @@ public:
/* Upload Bindless Mapping */
load_bindless_mapping();
if(!use_split_kernel()) {
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample, branched);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
}
}
else {
DeviceRequestedFeatures requested_features;
DeviceRequestedFeatures requested_features;
if(use_split_kernel()) {
if(!use_adaptive_compilation()) {
requested_features.max_closure = 64;
}
@@ -1345,17 +1717,46 @@ public:
split_kernel = new CUDASplitKernel(this);
split_kernel->load_kernels(requested_features);
}
}
while(task->acquire_tile(this, tile)) {
device_memory void_buffer;
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
task->release_tile(tile);
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
/* keep rendering tiles until done */
while(task->acquire_tile(this, tile)) {
if(tile.task == RenderTile::PATH_TRACE) {
if(use_split_kernel()) {
device_memory void_buffer;
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
}
else {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample, branched);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
}
}
else if(tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
}
}


@@ -0,0 +1,218 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "device/device_denoising.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
void DenoisingTask::init_from_devicetask(const DeviceTask &task)
{
radius = task.denoising_radius;
nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
if(task.denoising_relative_pca) {
pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
}
else {
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
}
render_buffer.pass_stride = task.pass_stride;
render_buffer.denoising_data_offset = task.pass_denoising_data;
render_buffer.denoising_clean_offset = task.pass_denoising_clean;
/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
rect = make_int4(max(tiles->x[0], filter_area.x - radius),
max(tiles->y[0], filter_area.y - radius),
min(tiles->x[3], filter_area.x + filter_area.z + radius),
min(tiles->y[3], filter_area.y + filter_area.w + radius));
}
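The powf/lerp calls above map the user-facing 0..1 strength slider onto an exponential parameter range. A minimal sketch of the nlm_k_2 mapping, with an illustrative helper name:

```cpp
#include <cassert>
#include <cmath>

static float lerp(float a, float b, float t) { return a + t*(b - a); }

// strength 0..1 maps to an exponent in [-5, 3], so nlm_k_2 spans
// 2^-5 (weakest filtering) to 2^3 (strongest).
float nlm_k_2_from_strength(float strength)
{
	return powf(2.0f, lerp(-5.0f, 3.0f, strength));
}
```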
void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
{
tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));
device_ptr buffers[9];
for(int i = 0; i < 9; i++) {
buffers[i] = rtiles[i].buffer;
tiles->offsets[i] = rtiles[i].offset;
tiles->strides[i] = rtiles[i].stride;
}
tiles->x[0] = rtiles[3].x;
tiles->x[1] = rtiles[4].x;
tiles->x[2] = rtiles[5].x;
tiles->x[3] = rtiles[5].x + rtiles[5].w;
tiles->y[0] = rtiles[1].y;
tiles->y[1] = rtiles[4].y;
tiles->y[2] = rtiles[7].y;
tiles->y[3] = rtiles[7].y + rtiles[7].h;
render_buffer.offset = rtiles[4].offset;
render_buffer.stride = rtiles[4].stride;
render_buffer.ptr = rtiles[4].buffer;
functions.set_tiles(buffers);
}
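The nine neighbor tiles are stored row-major around the center tile (index 4), which is why the x extents above come from tiles 3 and 5 and the y extents from tiles 1 and 7. A small sketch of that layout (the helper is hypothetical):

```cpp
#include <cassert>

// col and row are each in {-1, 0, 1}; (0, 0) is the center tile.
// Row-major layout: index = (row+1)*3 + (col+1).
int neighbor_index(int col, int row)
{
	return (row + 1)*3 + (col + 1);
}
```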
bool DenoisingTask::run_denoising()
{
/* Allocate denoising buffer. */
buffer.passes = 14;
buffer.w = align_up(rect.z - rect.x, 4);
buffer.h = rect.w - rect.y;
buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
buffer.mem.resize(buffer.pass_stride * buffer.passes);
device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
device_ptr null_ptr = (device_ptr) 0;
/* Prefilter shadow feature. */
{
device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
nlm_state.temporary_1_ptr = *nlm_temporary_1;
nlm_state.temporary_2_ptr = *nlm_temporary_2;
nlm_state.temporary_3_ptr = *nlm_temporary_3;
/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
/* Reuse memory, the previous data isn't needed anymore. */
device_ptr filtered_a = *buffer_var,
filtered_b = *sample_var;
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
device_ptr residual_var = *sample_var_var;
/* Estimate the residual variance between the two filtered halves. */
functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
device_ptr final_a = *unfiltered_a,
final_b = *unfiltered_b;
/* Use the residual variance for a second filter pass. */
nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
/* Combine the two double-filtered halves to a final shadow feature. */
device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
}
/* Prefilter general features. */
{
device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
nlm_state.temporary_1_ptr = *nlm_temporary_1;
nlm_state.temporary_2_ptr = *nlm_temporary_2;
nlm_state.temporary_3_ptr = *nlm_temporary_3;
int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 };
for(int pass = 0; pass < 7; pass++) {
device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
/* Get the unfiltered pass and its variance from the RenderBuffers. */
functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
/* Smooth the pass and store the result in the denoising buffers. */
nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
}
}
/* Copy color passes. */
{
int mean_from[] = {20, 21, 22};
int variance_from[] = {23, 24, 25};
int mean_to[] = { 8, 9, 10};
int variance_to[] = {11, 12, 13};
int num_color_passes = 3;
for(int pass = 0; pass < num_color_passes; pass++) {
device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
}
}
storage.w = filter_area.z;
storage.h = filter_area.w;
storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
storage.rank.resize(storage.w*storage.h);
device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
functions.construct_transform();
device_only_memory<float> temporary_1;
device_only_memory<float> temporary_2;
temporary_1.resize(buffer.w*buffer.h);
temporary_2.resize(buffer.w*buffer.h);
device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
render_buffer.stride,
render_buffer.pass_stride,
render_buffer.denoising_clean_offset);
reconstruction_state.source_w = rect.z-rect.x;
reconstruction_state.source_h = rect.w-rect.y;
{
device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
}
device->mem_free(storage.XtWX);
device->mem_free(storage.XtWY);
device->mem_free(storage.transform);
device->mem_free(storage.rank);
device->mem_free(temporary_1);
device->mem_free(temporary_2);
device->mem_free(buffer.mem);
device->mem_free(tiles_mem);
return true;
}
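run_denoising() sizes the per-pass stride with align_up/divide_up so that every pass starts at a properly aligned float offset. A sketch of that integer arithmetic, assuming the usual round-up definitions of these helpers:

```cpp
#include <cassert>

// Ceiling division and rounding up to the next multiple of y.
int divide_up(int x, int y) { return (x + y - 1) / y; }
int align_up(int x, int y)  { return divide_up(x, y) * y; }
```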
CCL_NAMESPACE_END


@@ -0,0 +1,145 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __DEVICE_DENOISING_H__
#define __DEVICE_DENOISING_H__
#include "device/device.h"
#include "render/buffers.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
class DenoisingTask {
public:
/* Parameters of the denoising algorithm. */
int radius;
float nlm_k_2;
float pca_threshold;
/* Pointer and parameters of the RenderBuffers. */
struct RenderBuffers {
int denoising_data_offset;
int denoising_clean_offset;
int pass_stride;
int offset;
int stride;
device_ptr ptr;
int samples;
} render_buffer;
TilesInfo *tiles;
device_vector<int> tiles_mem;
void tiles_from_rendertiles(RenderTile *rtiles);
int4 rect;
int4 filter_area;
struct DeviceFunctions {
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
device_ptr variance_ptr, /* Contains the variance of the guide image. */
device_ptr out_ptr /* The filtered output is written into this image. */
)> non_local_means;
function<bool(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr
)> reconstruct;
function<bool()> construct_transform;
function<bool(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
int r,
int4 rect
)> combine_halves;
function<bool(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr sample_variance_ptr,
device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr
)> divide_shadow;
function<bool(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr
)> get_feature;
function<bool(device_ptr*)> set_tiles;
} functions;
/* Stores state of the current Reconstruction operation,
* which is accessed by the device in order to perform the operation. */
struct ReconstructionState {
device_ptr temporary_1_ptr; /* These two images are used as temporary storage. */
device_ptr temporary_2_ptr;
int4 filter_rect;
int4 buffer_params;
int source_w;
int source_h;
} reconstruction_state;
/* Stores state of the current NLM operation,
* which is accessed by the device in order to perform the operation. */
struct NLMState {
device_ptr temporary_1_ptr; /* These three images are used as temporary storage. */
device_ptr temporary_2_ptr;
device_ptr temporary_3_ptr;
int r; /* Search radius of the filter. */
int f; /* Patch size of the filter. */
float a; /* Variance compensation factor in the MSE estimation. */
float k_2; /* Squared value of the k parameter of the filter. */
void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_; k_2 = k_2_; }
} nlm_state;
struct Storage {
device_only_memory<float> transform;
device_only_memory<int> rank;
device_only_memory<float> XtWX;
device_only_memory<float3> XtWY;
int w;
int h;
} storage;
DenoisingTask(Device *device) : device(device) {}
void init_from_devicetask(const DeviceTask &task);
bool run_denoising();
struct DenoiseBuffers {
int pass_stride;
int passes;
int w;
int h;
device_only_memory<float> mem;
} buffer;
protected:
Device *device;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_DENOISING_H__ */


@@ -35,6 +35,8 @@
CCL_NAMESPACE_BEGIN
class Device;
enum MemoryType {
MEM_READ_ONLY,
MEM_WRITE_ONLY,
@@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
template<> struct device_type_traits<float3> {
static const DataType data_type = TYPE_FLOAT;
static const int num_elements = 3;
static const int num_elements = 4;
};
template<> struct device_type_traits<float4> {
@@ -173,6 +175,9 @@ class device_memory
{
public:
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
size_t memory_elements_size(int elements) {
return elements*data_elements*datatype_size(data_type);
}
/* data information */
DataType data_type;
@@ -213,6 +218,22 @@ protected:
device_memory& operator = (const device_memory&);
};
template<typename T>
class device_only_memory : public device_memory
{
public:
device_only_memory()
{
data_type = device_type_traits<T>::data_type;
data_elements = max(device_type_traits<T>::num_elements, 1);
}
void resize(size_t num)
{
device_memory::resize(num*sizeof(T));
}
};
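memory_elements_size() above is plain byte bookkeeping: element count times components per element times bytes per component. A standalone sketch; note that the float3 trait in this diff deliberately reports four components, so float3 passes are padded to a full float4 on the device:

```cpp
#include <cassert>
#include <cstddef>

// Byte size of `elements` items, each made of `data_elements`
// components of `datatype_size` bytes.
size_t memory_elements_size(int elements, int data_elements, size_t datatype_size)
{
	return elements * data_elements * datatype_size;
}
```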
/* Device Vector */
template<typename T> class device_vector : public device_memory
@@ -299,6 +320,27 @@ private:
array<T> data;
};
/* A device_sub_ptr is a pointer into another existing memory allocation.
* Therefore, it is not allocated separately, but just created from the already allocated base memory.
* It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
* Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
class device_sub_ptr
{
public:
device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
~device_sub_ptr();
/* No copying. */
device_sub_ptr& operator = (const device_sub_ptr&);
device_ptr operator*() const
{
return ptr;
}
protected:
Device *device;
device_ptr ptr;
};
CCL_NAMESPACE_END
#endif /* __DEVICE_MEMORY_H__ */


@@ -299,6 +299,60 @@ public:
return -1;
}
void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
{
for(int i = 0; i < 9; i++) {
if(!tiles[i].buffers) {
continue;
}
/* If the tile was rendered on another device, copy its memory to
* the current device now, for the duration of the denoising task.
* Note that this temporarily modifies the RenderBuffers and calls
* the device, so this function is not thread safe. */
if(tiles[i].buffers->device != sub_device) {
device_vector<float> &mem = tiles[i].buffers->buffer;
tiles[i].buffers->copy_from_device();
device_ptr original_ptr = mem.device_pointer;
mem.device_pointer = 0;
sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
sub_device->mem_copy_to(mem);
tiles[i].buffer = mem.device_pointer;
mem.device_pointer = original_ptr;
}
}
}
void unmap_neighbor_tiles(Device *sub_device, RenderTile *tiles)
{
for(int i = 0; i < 9; i++) {
if(!tiles[i].buffers) {
continue;
}
if(tiles[i].buffers->device != sub_device) {
device_vector<float> &mem = tiles[i].buffers->buffer;
device_ptr original_ptr = mem.device_pointer;
mem.device_pointer = tiles[i].buffer;
/* Copy denoised tile to the host. */
if(i == 4) {
tiles[i].buffers->copy_from_device(sub_device);
}
size_t mem_size = mem.device_size;
sub_device->mem_free(mem);
mem.device_pointer = original_ptr;
mem.device_size = mem_size;
/* Copy denoised tile to the original device. */
if(i == 4) {
tiles[i].buffers->device->mem_copy_to(mem);
}
}
}
}
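map_neighbor_tiles() temporarily swaps the buffer's device_pointer to a copy on the denoising device and restores it afterwards, so the owning device's bookkeeping is untouched. A toy sketch of that stash-and-restore pattern, with plain integers standing in for device pointers and hypothetical names throughout:

```cpp
#include <cassert>

struct Mem { unsigned long device_pointer; };

// Stash the original pointer, substitute the temporary allocation for
// the tile's view of the buffer, then restore the original so the
// owning device still sees its own allocation.
unsigned long map_on_sub_device(Mem &mem, unsigned long temp_alloc)
{
	unsigned long original = mem.device_pointer;
	mem.device_pointer = temp_alloc;          /* tile now uses the copy */
	unsigned long tile_buffer = mem.device_pointer;
	mem.device_pointer = original;            /* restore owner's pointer */
	return tile_buffer;
}
```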
int get_split_task_count(DeviceTask& task)
{
int total_tasks = 0;


@@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
/* Allocate work_pool_wgs memory. */
work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
work_pool_wgs.resize(max_work_groups);
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
queue_index.resize(NUM_QUEUES * sizeof(int));
queue_index.resize(NUM_QUEUES);
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
use_queues_flag.resize(sizeof(char));
use_queues_flag.resize(1);
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
ray_state.resize(num_global_elements);


@@ -80,16 +80,16 @@ private:
*/
device_memory split_data;
device_vector<uchar> ray_state;
device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
/* Flag to make the scene_intersect and lamp_emission kernels use queues. */
device_memory use_queues_flag;
device_only_memory<char> use_queues_flag;
/* Approximate time it takes to complete one sample */
double avg_time_per_sample;
/* Work pool with respect to each work group. */
device_memory work_pool_wgs;
device_only_memory<unsigned int> work_pool_wgs;
/* clos_max value for which the kernels have been loaded currently. */
int current_max_closure;

View File

@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
if(type == SHADER) {
num = min(shader_w, num);
}
else if(type == PATH_TRACE) {
else if(type == RENDER) {
}
else {
num = min(h, num);
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
tasks.push_back(task);
}
}
else if(type == PATH_TRACE) {
else if(type == RENDER) {
for(int i = 0; i < num; i++)
tasks.push_back(*this);
}
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
{
if((type != PATH_TRACE) &&
if((type != RENDER) &&
(type != SHADER))
return;

View File

@@ -34,7 +34,7 @@ class Tile;
class DeviceTask : public Task {
public:
typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
Type type;
int x, y, w, h;
@@ -53,7 +53,7 @@ public:
int passes_size;
explicit DeviceTask(Type type = PATH_TRACE);
explicit DeviceTask(Type type = RENDER);
int get_subtask_count(int num, int max_size = 0);
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
@@ -65,6 +65,16 @@ public:
function<void(RenderTile&)> update_tile_sample;
function<void(RenderTile&)> release_tile;
function<bool(void)> get_cancel;
function<void(RenderTile*, Device*)> map_neighbor_tiles;
function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
int denoising_radius;
float denoising_strength;
float denoising_feature_strength;
bool denoising_relative_pca;
int pass_stride;
int pass_denoising_data;
int pass_denoising_clean;
bool need_finish_queue;
bool integrator_branched;

View File

@@ -17,6 +17,7 @@
#ifdef WITH_OPENCL
#include "device/device.h"
#include "device/device_denoising.h"
#include "util/util_map.h"
#include "util/util_param.h"
@@ -129,6 +130,8 @@ public:
cl_int* error = NULL);
static cl_device_type get_device_type(cl_device_id device_id);
static int mem_address_alignment(cl_device_id device_id);
/* Get a somewhat more readable device name.
* The main difference is AMD OpenCL, which only gives a code name
* for the regular device name. This will give a more sane device
@@ -218,7 +221,7 @@ public:
cl_int err = stmt; \
\
if(err != CL_SUCCESS) { \
string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \
@@ -282,7 +285,7 @@ public:
map<ustring, cl_kernel> kernels;
};
OpenCLProgram base_program;
OpenCLProgram base_program, denoising_program;
typedef map<string, device_vector<uchar>*> ConstMemMap;
typedef map<string, device_ptr> MemMap;
@@ -320,6 +323,9 @@ public:
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
void mem_zero(device_memory& mem);
void mem_free(device_memory& mem);
int mem_address_alignment();
void const_copy_to(const char *name, void *host, size_t size);
void tex_alloc(const char *name,
device_memory& mem,
@@ -328,12 +334,14 @@ public:
void tex_free(device_memory& mem);
size_t global_size_round_up(int group_size, int global_size);
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
void shader(DeviceTask& task);
void denoise(RenderTile& tile, const DeviceTask& task);
class OpenCLDeviceTask : public DeviceTask {
public:
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
@@ -367,9 +375,48 @@ public:
virtual void thread_run(DeviceTask * /*task*/) = 0;
virtual bool is_split_kernel() = 0;
protected:
string kernel_build_options(const string *debug_src = NULL);
void mem_zero_kernel(device_ptr ptr, size_t size);
bool denoising_non_local_means(device_ptr image_ptr,
device_ptr guide_ptr,
device_ptr variance_ptr,
device_ptr out_ptr,
DenoisingTask *task);
bool denoising_construct_transform(DenoisingTask *task);
bool denoising_reconstruct(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr,
DenoisingTask *task);
bool denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
int r, int4 rect,
DenoisingTask *task);
bool denoising_divide_shadow(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr sample_variance_ptr,
device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr,
DenoisingTask *task);
bool denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task);
bool denoising_set_tiles(device_ptr *buffers,
DenoisingTask *task);
device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
void mem_free_sub_ptr(device_ptr ptr);
class ArgumentWrapper {
public:
ArgumentWrapper() : size(0), pointer(NULL)

View File

@@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
base_program.add_kernel(ustring("bake"));
base_program.add_kernel(ustring("zero_buffer"));
denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
denoising_program.add_kernel(ustring("filter_divide_shadow"));
denoising_program.add_kernel(ustring("filter_get_feature"));
denoising_program.add_kernel(ustring("filter_combine_halves"));
denoising_program.add_kernel(ustring("filter_construct_transform"));
denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
denoising_program.add_kernel(ustring("filter_nlm_blur"));
denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
denoising_program.add_kernel(ustring("filter_nlm_update_output"));
denoising_program.add_kernel(ustring("filter_nlm_normalize"));
denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
denoising_program.add_kernel(ustring("filter_finalize"));
denoising_program.add_kernel(ustring("filter_set_tiles"));
vector<OpenCLProgram*> programs;
programs.push_back(&base_program);
programs.push_back(&denoising_program);
/* Call actual class to fill the vector with its programs. */
if(!load_kernels(requested_features, programs)) {
return false;
@@ -322,37 +337,42 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
NULL, NULL));
}
void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
{
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
size_t global_size[] = {1024, 1024};
size_t num_threads = global_size[0] * global_size[1];
cl_mem d_buffer = CL_MEM_PTR(mem);
cl_ulong d_offset = 0;
cl_ulong d_size = 0;
while(d_offset < size) {
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
ckZeroBuffer,
2,
NULL,
global_size,
NULL,
0,
NULL,
NULL);
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
d_offset += d_size;
}
}
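For reference, the chunk enumeration performed by `mem_zero_kernel` above (each `zero_buffer` launch clears at most `num_threads * sizeof(float4)` bytes, advancing `d_offset` until the buffer is covered) can be sketched as a small Python helper. This is an illustrative model, not Cycles code; the defaults mirror the `{1024, 1024}` global size and 16-byte `float4` element in the OpenCL loop.

```python
def zero_chunks(total_size, num_threads=1024 * 1024, elem_size=16):
    """Enumerate the (offset, size) chunks mem_zero_kernel walks.

    Each kernel launch covers at most num_threads * elem_size bytes
    (elem_size = sizeof(float4) = 16 in the OpenCL code); the loop
    advances until the whole buffer has been zeroed.
    """
    chunks = []
    offset = 0
    while offset < total_size:
        size = min(num_threads * elem_size, total_size - offset)
        chunks.append((offset, size))
        offset += size
    return chunks
```

The chunks are contiguous and sum exactly to the buffer size, so a buffer of any length is zeroed in a bounded number of launches.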
void OpenCLDeviceBase::mem_zero(device_memory& mem)
{
if(mem.device_pointer) {
if(base_program.is_loaded()) {
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
size_t global_size[] = {1024, 1024};
size_t num_threads = global_size[0] * global_size[1];
cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
cl_ulong d_offset = 0;
cl_ulong d_size = 0;
while(d_offset < mem.memory_size()) {
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
ckZeroBuffer,
2,
NULL,
global_size,
NULL,
0,
NULL,
NULL);
opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
d_offset += d_size;
}
mem_zero_kernel(mem.device_pointer, mem.memory_size());
}
if(mem.data_pointer) {
@@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
}
}
int OpenCLDeviceBase::mem_address_alignment()
{
return OpenCLInfo::mem_address_alignment(cdDevice);
}
device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
{
cl_mem_flags mem_flag;
if(type == MEM_READ_ONLY)
mem_flag = CL_MEM_READ_ONLY;
else if(type == MEM_WRITE_ONLY)
mem_flag = CL_MEM_WRITE_ONLY;
else
mem_flag = CL_MEM_READ_WRITE;
cl_buffer_region info;
info.origin = mem.memory_elements_size(offset);
info.size = mem.memory_elements_size(size);
device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
mem_flag,
CL_BUFFER_CREATE_TYPE_REGION,
&info,
&ciErr);
opencl_assert_err(ciErr, "clCreateSubBuffer");
return sub_buf;
}
void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
{
if(device_pointer && device_pointer != null_mem) {
opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
}
}
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
{
ConstMemMap::iterator i = const_mem_map.find(name);
@@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
return global_size + ((r == 0)? 0: group_size - r);
}
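The round-up in `global_size_round_up` above pads the global work size to the next multiple of the work-group size, as OpenCL requires when an explicit local size is passed to `clEnqueueNDRangeKernel`. A direct Python transcription of the same arithmetic:

```python
def global_size_round_up(group_size, global_size):
    # Round global_size up to the next multiple of group_size,
    # mirroring OpenCLDeviceBase::global_size_round_up.
    r = global_size % group_size
    return global_size + (0 if r == 0 else group_size - r)
```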
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
{
size_t workgroup_size, max_work_items[3];
@@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
clGetDeviceInfo(cdDevice,
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
workgroup_size = max_workgroup_size;
}
/* Try to divide evenly over 2 dimensions. */
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
@@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
}
bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
device_ptr guide_ptr,
device_ptr variance_ptr,
device_ptr out_ptr,
DenoisingTask *task)
{
int4 rect = task->rect;
int w = rect.z-rect.x;
int h = rect.w-rect.y;
int r = task->nlm_state.r;
int f = task->nlm_state.f;
float a = task->nlm_state.a;
float k_2 = task->nlm_state.k_2;
cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
cl_mem image_mem = CL_MEM_PTR(image_ptr);
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem out_mem = CL_MEM_PTR(out_ptr);
mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
mem_zero_kernel(out_ptr, sizeof(float)*w*h);
cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
kernel_set_args(ckNLMCalcDifference, 0,
dx, dy, guide_mem, variance_mem,
difference, local_rect, w, 0, a, k_2);
kernel_set_args(ckNLMBlur, 0,
difference, blurDifference, local_rect, w, f);
kernel_set_args(ckNLMCalcWeight, 0,
blurDifference, difference, local_rect, w, f);
kernel_set_args(ckNLMUpdateOutput, 0,
dx, dy, blurDifference, image_mem,
out_mem, weightAccum, local_rect, w, f);
enqueue_kernel(ckNLMCalcDifference, w, h);
enqueue_kernel(ckNLMBlur, w, h);
enqueue_kernel(ckNLMCalcWeight, w, h);
enqueue_kernel(ckNLMBlur, w, h);
enqueue_kernel(ckNLMUpdateOutput, w, h);
}
int4 local_rect = make_int4(0, 0, w, h);
kernel_set_args(ckNLMNormalize, 0,
out_mem, weightAccum, local_rect, w);
enqueue_kernel(ckNLMNormalize, w, h);
return true;
}
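The `i`-loop above linearizes the `(2r+1) x (2r+1)` non-local-means search window into a single index, recovering the pixel offset as `dy = i / (2r+1) - r` and `dx = i % (2r+1) - r`. A minimal Python sketch of that enumeration (an illustrative helper, not part of the patch):

```python
def nlm_offsets(r):
    """Enumerate the (dx, dy) offsets of a (2r+1) x (2r+1) NLM search
    window in the same linearized order as the i-loop above:
    dy = i // (2r+1) - r, dx = i % (2r+1) - r."""
    side = 2 * r + 1
    return [(i % side - r, i // side - r) for i in range(side * side)]
```

Every offset pair in `[-r, r]^2` is visited exactly once, which is why the per-offset `local_rect` clamping (`max(0, -dx)`, etc.) suffices to keep all reads in bounds.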
bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
{
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
kernel_set_args(ckFilterConstructTransform, 0,
buffer_mem,
transform_mem,
rank_mem,
task->filter_area,
task->rect,
task->buffer.pass_stride,
task->radius,
task->pca_threshold);
enqueue_kernel(ckFilterConstructTransform,
task->storage.w,
task->storage.h,
256);
return true;
}
bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
device_ptr color_variance_ptr,
device_ptr guide_ptr,
device_ptr guide_variance_ptr,
device_ptr output_ptr,
DenoisingTask *task)
{
mem_zero(task->storage.XtWX);
mem_zero(task->storage.XtWY);
cl_mem color_mem = CL_MEM_PTR(color_ptr);
cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
cl_mem output_mem = CL_MEM_PTR(output_ptr);
cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
int r = task->radius;
int f = 4;
float a = 1.0f;
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
int dy = i / (2*r+1) - r;
int dx = i % (2*r+1) - r;
int local_rect[4] = {max(0, -dx), max(0, -dy),
task->reconstruction_state.source_w - max(0, dx),
task->reconstruction_state.source_h - max(0, dy)};
kernel_set_args(ckNLMCalcDifference, 0,
dx, dy,
guide_mem,
guide_variance_mem,
difference,
local_rect,
task->buffer.w,
task->buffer.pass_stride,
a, task->nlm_k_2);
enqueue_kernel(ckNLMCalcDifference,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
kernel_set_args(ckNLMBlur, 0,
difference,
blurDifference,
local_rect,
task->buffer.w,
f);
enqueue_kernel(ckNLMBlur,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
kernel_set_args(ckNLMCalcWeight, 0,
blurDifference,
difference,
local_rect,
task->buffer.w,
f);
enqueue_kernel(ckNLMCalcWeight,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
/* Reuse previous arguments. */
enqueue_kernel(ckNLMBlur,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
kernel_set_args(ckNLMConstructGramian, 0,
dx, dy,
blurDifference,
buffer_mem,
color_mem,
color_variance_mem,
transform_mem,
rank_mem,
XtWX_mem,
XtWY_mem,
local_rect,
task->reconstruction_state.filter_rect,
task->buffer.w,
task->buffer.h,
f,
task->buffer.pass_stride);
enqueue_kernel(ckNLMConstructGramian,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h,
256);
}
kernel_set_args(ckFinalize, 0,
task->buffer.w,
task->buffer.h,
output_mem,
rank_mem,
XtWX_mem,
XtWY_mem,
task->filter_area,
task->reconstruction_state.buffer_params,
task->render_buffer.samples);
enqueue_kernel(ckFinalize,
task->reconstruction_state.source_w,
task->reconstruction_state.source_h);
return true;
}
bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr mean_ptr,
device_ptr variance_ptr,
int r, int4 rect,
DenoisingTask *task)
{
(void) task;
cl_mem a_mem = CL_MEM_PTR(a_ptr);
cl_mem b_mem = CL_MEM_PTR(b_ptr);
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
kernel_set_args(ckFilterCombineHalves, 0,
mean_mem,
variance_mem,
a_mem,
b_mem,
rect,
r);
enqueue_kernel(ckFilterCombineHalves,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
device_ptr b_ptr,
device_ptr sample_variance_ptr,
device_ptr sv_variance_ptr,
device_ptr buffer_variance_ptr,
DenoisingTask *task)
{
(void) task;
cl_mem a_mem = CL_MEM_PTR(a_ptr);
cl_mem b_mem = CL_MEM_PTR(b_ptr);
cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterDivideShadow, 0,
task->render_buffer.samples,
tiles_mem,
a_mem,
b_mem,
sample_variance_mem,
sv_variance_mem,
buffer_variance_mem,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset,
split_kernel);
enqueue_kernel(ckFilterDivideShadow,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
int variance_offset,
device_ptr mean_ptr,
device_ptr variance_ptr,
DenoisingTask *task)
{
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
char split_kernel = is_split_kernel()? 1 : 0;
kernel_set_args(ckFilterGetFeature, 0,
task->render_buffer.samples,
tiles_mem,
mean_offset,
variance_offset,
mean_mem,
variance_mem,
task->rect,
task->render_buffer.pass_stride,
task->render_buffer.denoising_data_offset,
split_kernel);
enqueue_kernel(ckFilterGetFeature,
task->rect.z-task->rect.x,
task->rect.w-task->rect.y);
return true;
}
bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
DenoisingTask *task)
{
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
mem_copy_to(task->tiles_mem);
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
for(int i = 0; i < 9; i++) {
cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
}
enqueue_kernel(ckFilterSetTiles, 1, 1);
return true;
}
void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
{
DenoisingTask denoising(this);
denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
denoising.render_buffer.samples = rtile.sample;
RenderTile rtiles[9];
rtiles[4] = rtile;
task.map_neighbor_tiles(rtiles, this);
denoising.tiles_from_rendertiles(rtiles);
denoising.init_from_devicetask(task);
denoising.run_denoising();
task.unmap_neighbor_tiles(rtiles, this);
}
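The `function_bind` calls in `denoise()` above partially apply each device method, appending the shared `DenoisingTask` so the device-independent denoising core can invoke them through a uniform callback table. A rough Python analogue using `functools.partial` (class and method names here are illustrative, not the actual Cycles API):

```python
from functools import partial

class DenoisingTask:
    """Stand-in for the callback table filled by denoise() above."""
    def __init__(self):
        self.functions = {}

class Device:
    def construct_transform(self, task):
        return ("transform", task)

    def non_local_means(self, image, guide, variance, out, task):
        return ("nlm", image, task)

device = Device()
task = DenoisingTask()
# Bind the trailing task argument while leaving the leading arguments
# open -- the analogue of function_bind(..., _1, ..., &denoising).
task.functions["construct_transform"] = partial(device.construct_transform, task)
task.functions["non_local_means"] = (
    lambda img, g, v, o: device.non_local_means(img, g, v, o, task)
)
```

The core then calls `task.functions["..."]` without knowing whether the backend is CPU, CUDA, or OpenCL.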
void OpenCLDeviceBase::shader(DeviceTask& task)
{
/* cast arguments to cl types */

View File

@@ -108,41 +108,53 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
else if(task->type == DeviceTask::PATH_TRACE) {
else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
if(tile.task == RenderTile::PATH_TRACE) {
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
for(int sample = start_sample; sample < end_sample; sample++) {
if(task->get_cancel()) {
if(task->need_finish_queue == false)
break;
}
path_trace(tile, sample);
tile.sample = sample + 1;
task->update_progress(&tile, tile.w*tile.h);
}
path_trace(tile, sample);
tile.sample = sample + 1;
/* Complete kernel execution before release tile */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
}
else if(tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
/* Complete kernel execution before release tile */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
task->release_tile(tile);
}
}
}
bool is_split_kernel()
{
return false;
}
};
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)

View File

@@ -100,7 +100,7 @@ public:
else if(task->type == DeviceTask::SHADER) {
shader(*task);
}
else if(task->type == DeviceTask::PATH_TRACE) {
else if(task->type == DeviceTask::RENDER) {
RenderTile tile;
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
@@ -123,21 +123,29 @@ public:
/* Keep rendering tiles until done. */
while(task->acquire_tile(this, tile)) {
split_kernel->path_trace(task,
tile,
kgbuffer,
*const_mem_map["__data"]);
if(tile.task == RenderTile::PATH_TRACE) {
assert(tile.task == RenderTile::PATH_TRACE);
split_kernel->path_trace(task,
tile,
kgbuffer,
*const_mem_map["__data"]);
/* Complete kernel execution before release tile. */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
/* Complete kernel execution before release tile. */
/* This helps in multi-device render;
* The device that reaches the critical-section function
* release_tile waits (stalling other devices from entering
* release_tile) for all kernels to complete. If device1 (a
* slow-render device) reaches release_tile first then it would
* stall device2 (a fast-render device) from proceeding to render
* next tile.
*/
clFinish(cqCommandQueue);
}
else if(tile.task == RenderTile::DENOISE) {
tile.sample = tile.start_sample + tile.num_samples;
denoise(tile, *task);
task->update_progress(&tile, tile.w*tile.h);
}
task->release_tile(tile);
}
@@ -146,6 +154,11 @@ public:
}
}
bool is_split_kernel()
{
return true;
}
protected:
/* ** These are for working around some compiler-specific bugs ** */

View File

@@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
return get_device_name(device_id);
}
int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
{
int base_align_bits;
if(clGetDeviceInfo(device_id,
CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(int),
&base_align_bits,
NULL) == CL_SUCCESS)
{
return base_align_bits/8;
}
return 1;
}
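Note the unit conversion above: `CL_DEVICE_MEM_BASE_ADDR_ALIGN` reports the alignment in bits, while sub-buffer origins passed to `clCreateSubBuffer` must be aligned in bytes, hence the division by 8. A small sketch of the conversion plus an offset round-up (the `align_up` helper is illustrative, not part of the patch):

```python
def mem_address_alignment_bytes(base_align_bits):
    # CL_DEVICE_MEM_BASE_ADDR_ALIGN reports the alignment in *bits*;
    # sub-buffer origins must satisfy it in bytes.
    return base_align_bits // 8

def align_up(offset, alignment):
    # Round a byte offset up to the next aligned boundary, e.g. when
    # choosing a valid sub-buffer origin.
    return (offset + alignment - 1) // alignment * alignment
```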
CCL_NAMESPACE_END
#endif

View File

@@ -10,7 +10,23 @@ set(INC_SYS
set(SRC
kernels/cpu/kernel.cpp
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
kernels/cpu/kernel_split_avx.cpp
kernels/cpu/kernel_split_avx2.cpp
kernels/cpu/filter.cpp
kernels/cpu/filter_sse2.cpp
kernels/cpu/filter_sse3.cpp
kernels/cpu/filter_sse41.cpp
kernels/cpu/filter_avx.cpp
kernels/cpu/filter_avx2.cpp
kernels/opencl/kernel.cl
kernels/opencl/kernel_state_buffer_size.cl
kernels/opencl/kernel_split.cl
@@ -32,8 +48,10 @@ set(SRC
kernels/opencl/kernel_next_iteration_setup.cl
kernels/opencl/kernel_indirect_subsurface.cl
kernels/opencl/kernel_buffer_update.cl
kernels/opencl/filter.cl
kernels/cuda/kernel.cu
kernels/cuda/kernel_split.cu
kernels/cuda/filter.cu
)
set(SRC_BVH_HEADERS
@@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
kernels/cpu/kernel_cpu.h
kernels/cpu/kernel_cpu_impl.h
kernels/cpu/kernel_cpu_image.h
kernels/cpu/filter_cpu.h
kernels/cpu/filter_cpu_impl.h
)
set(SRC_KERNELS_CUDA_HEADERS
@@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
geom/geom_volume.h
)
set(SRC_FILTER_HEADERS
filter/filter.h
filter/filter_defines.h
filter/filter_features.h
filter/filter_features_sse.h
filter/filter_kernel.h
filter/filter_nlm_cpu.h
filter/filter_nlm_gpu.h
filter/filter_prefilter.h
filter/filter_reconstruction.h
filter/filter_transform.h
filter/filter_transform_gpu.h
filter/filter_transform_sse.h
)
set(SRC_UTIL_HEADERS
../util/util_atomic.h
../util/util_color.h
@@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
../util/util_math_int2.h
../util/util_math_int3.h
../util/util_math_int4.h
../util/util_math_matrix.h
../util/util_static_assert.h
../util/util_transform.h
../util/util_texture.h
@@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
${SRC_CLOSURE_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_filter_sources kernels/cuda/filter.cu
${SRC_HEADERS}
${SRC_KERNELS_CUDA_HEADERS}
${SRC_FILTER_HEADERS}
${SRC_UTIL_HEADERS}
)
set(cuda_cubins)
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
if(${split})
set(cuda_extra_flags "-D__SPLIT__")
set(cuda_cubin kernel_split)
else()
set(cuda_extra_flags "")
set(cuda_cubin kernel)
endif()
macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
if(${experimental})
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
set(cuda_cubin ${cuda_cubin}_experimental)
set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
set(name ${name}_experimental)
endif()
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
set(cuda_cubin ${name}_${arch}.cubin)
if(WITH_CYCLES_DEBUG)
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
@@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
set(cuda_math_flags "--use_fast_math")
if(split)
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
else()
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
endif()
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
add_custom_command(
OUTPUT ${cuda_cubin}
@@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
${cuda_arch_flags}
${cuda_version_flags}
${cuda_math_flags}
${cuda_extra_flags}
${flags}
${cuda_debug_flags}
-I${CMAKE_CURRENT_SOURCE_DIR}/..
-DCCL_NAMESPACE_BEGIN=
-DCCL_NAMESPACE_END=
-DNVCC
DEPENDS ${cuda_sources})
DEPENDS ${sources})
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
list(APPEND cuda_cubins ${cuda_cubin})
@@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
# Compile regular kernel
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
# Compile split kernel
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" ${cuda_sources} FALSE)
endif()
endforeach()
@@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
if(CXX_HAS_SSE)
list(APPEND SRC
kernels/cpu/kernel_sse2.cpp
kernels/cpu/kernel_sse3.cpp
kernels/cpu/kernel_sse41.cpp
kernels/cpu/kernel_split_sse2.cpp
kernels/cpu/kernel_split_sse3.cpp
kernels/cpu/kernel_split_sse41.cpp
)
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX)
list(APPEND SRC
kernels/cpu/kernel_avx.cpp
kernels/cpu/kernel_split_avx.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
endif()
if(CXX_HAS_AVX2)
list(APPEND SRC
kernels/cpu/kernel_avx2.cpp
kernels/cpu/kernel_split_avx2.cpp
)
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
endif()
add_library(cycles_kernel
@@ -432,6 +452,7 @@ add_library(cycles_kernel
${SRC_KERNELS_CUDA_HEADERS}
${SRC_BVH_HEADERS}
${SRC_CLOSURE_HEADERS}
${SRC_FILTER_HEADERS}
${SRC_SVM_HEADERS}
${SRC_GEOM_HEADERS}
${SRC_SPLIT_HEADERS}
@@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)

View File

@@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
#endif
}
/* Classifies a closure as diffuse-like or specular-like.
 * This is needed for generating the denoising feature passes,
 * which are written on the first bounce where more than 25%
 * of the sampling weight belongs to diffuse-like closures. */
ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
{
if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
return true;
}
if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
}
return false;
}
CCL_NAMESPACE_END

View File

@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
float sigma;
float invsigma2;
float3 N;
} VelvetBsdf;
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)

View File

@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseBsdf {
SHADER_CLOSURE_BASE;
float3 N;
} DiffuseBsdf;
/* DIFFUSE */

View File

@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct DiffuseRampBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float3 *colors;
} DiffuseRampBsdf;

View File

@@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
float alpha_x, alpha_y, ior;
MicrofacetExtra *extra;
float3 T;
float3 N;
} MicrofacetBsdf;
/* Beckmann and GGX microfacet importance sampling. */

View File

@@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
{
if(cosI > 0.9999f || cosI < 1e-6f) {
if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
const float phi = M_2PI_F * randU.y;
return make_float2(r*cosf(phi), r*sinf(phi));

View File

@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct OrenNayarBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float roughness;
float a;
float b;

View File

@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PhongRampBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float exponent;
float3 *colors;
} PhongRampBsdf;

View File

@@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
SHADER_CLOSURE_BASE;
float roughness;
float3 N;
} PrincipledDiffuseBsdf;
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,

View File

@@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct PrincipledSheenBsdf {
SHADER_CLOSURE_BASE;
float3 N;
} PrincipledSheenBsdf;
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,

View File

@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
typedef ccl_addr_space struct ToonBsdf {
SHADER_CLOSURE_BASE;
float3 N;
float size;
float smooth;
} ToonBsdf;

View File

@@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
float texture_blur;
float albedo;
float roughness;
float3 N;
} Bssrdf;
/* Planar Truncated Gaussian

View File

@@ -0,0 +1,52 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FILTER_H__
#define __FILTER_H__
/* CPU Filter Kernel Interface */
#include "util/util_types.h"
#include "kernel/filter/filter_defines.h"
CCL_NAMESPACE_BEGIN
#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/filter_cpu.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/filter_cpu.h"
CCL_NAMESPACE_END
#endif /* __FILTER_H__ */

View File

@@ -0,0 +1,38 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FILTER_DEFINES_H__
#define __FILTER_DEFINES_H__
#define DENOISE_FEATURES 10
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
#define XTWY_SIZE (DENOISE_FEATURES+1)
typedef struct TilesInfo {
int offsets[9];
int strides[9];
int x[4];
int y[4];
/* TODO(lukas): CUDA doesn't have uint64_t... */
#ifdef __KERNEL_OPENCL__
ccl_global float *buffers[9];
#else
long long int buffers[9];
#endif
} TilesInfo;
#endif /* __FILTER_DEFINES_H__ */

View File

@@ -0,0 +1,127 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
* pixel_buffer always points to the current pixel in the first pass. */
#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
#define END_FOR_PIXEL_WINDOW } \
pixel_buffer += buffer_w - (high.x - low.x); \
}
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
{
features[0] = pixel.x;
features[1] = pixel.y;
features[2] = ccl_get_feature(buffer, 0);
features[3] = ccl_get_feature(buffer, 1);
features[4] = ccl_get_feature(buffer, 2);
features[5] = ccl_get_feature(buffer, 3);
features[6] = ccl_get_feature(buffer, 4);
features[7] = ccl_get_feature(buffer, 5);
features[8] = ccl_get_feature(buffer, 6);
features[9] = ccl_get_feature(buffer, 7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] -= mean[i];
}
}
ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
{
scales[0] = fabsf(pixel.x - mean[0]);
scales[1] = fabsf(pixel.y - mean[1]);
scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
ccl_get_feature(buffer, 2) - mean[4],
ccl_get_feature(buffer, 3) - mean[5]));
scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
ccl_get_feature(buffer, 6) - mean[8],
ccl_get_feature(buffer, 7) - mean[9]));
}
ccl_device_inline void filter_calculate_scale(float *scale)
{
scale[0] = 1.0f/max(scale[0], 0.01f);
scale[1] = 1.0f/max(scale[1], 0.01f);
scale[2] = 1.0f/max(scale[2], 0.01f);
scale[6] = 1.0f/max(scale[4], 0.01f);
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
}
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
{
return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
}
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
{
return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
}
ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_variance, float3 center_color, float sqrt_center_variance)
{
float color_diff = average(fabs(pixel_color - center_color));
float variance = sqrt_center_variance + sqrtf(pixel_variance) + 0.005f;
return (color_diff > 3.0f*variance);
}
ccl_device_inline void design_row_add(float *design_row,
int rank,
ccl_global float ccl_restrict_ptr transform,
int stride,
int row,
float feature)
{
for(int i = 0; i < rank; i++) {
design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
}
}
/* Fill the design row. */
ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
ccl_global float ccl_restrict_ptr p_buffer,
int2 q_pixel,
ccl_global float ccl_restrict_ptr q_buffer,
int pass_stride,
int rank,
float *design_row,
ccl_global float ccl_restrict_ptr transform,
int stride)
{
design_row[0] = 1.0f;
math_vector_zero(design_row+1, rank);
design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,95 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
 * pixel_buffer always points to the first of the 4 current pixels in the first pass.
* x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
__m128 y4 = _mm_set1_ps(pixel.y); \
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
__m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
__m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
#define END_FOR_PIXEL_WINDOW_SSE } \
pixel_buffer += buffer_w - (pixel.x - low.x); \
}
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
{
features[0] = x;
features[1] = y;
features[2] = ccl_get_feature_sse(0);
features[3] = ccl_get_feature_sse(1);
features[4] = ccl_get_feature_sse(2);
features[5] = ccl_get_feature_sse(3);
features[6] = ccl_get_feature_sse(4);
features[7] = ccl_get_feature_sse(5);
features[8] = ccl_get_feature_sse(6);
features[9] = ccl_get_feature_sse(7);
if(mean) {
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] = _mm_sub_ps(features[i], mean[i]);
}
for(int i = 0; i < DENOISE_FEATURES; i++)
features[i] = _mm_mask_ps(features[i], active_pixels);
}
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
{
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
__m128 diff, scale;
diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
scale = _mm_mul_ps(diff, diff);
diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
scales[3] = _mm_mask_ps(scale, active_pixels);
scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
scale = _mm_mul_ps(diff, diff);
diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
scales[5] = _mm_mask_ps(scale, active_pixels);
}
ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
{
scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,69 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "util/util_color.h"
#include "util/util_math.h"
#include "util/util_math_fast.h"
#include "util/util_texture.h"
#include "util/util_atomic.h"
#include "util/util_math_matrix.h"
#include "kernel/filter/filter_defines.h"
#include "kernel/filter/filter_features.h"
#ifdef __KERNEL_SSE3__
# include "kernel/filter/filter_features_sse.h"
#endif
#include "kernel/filter/filter_prefilter.h"
#ifdef __KERNEL_GPU__
# include "kernel/filter/filter_transform_gpu.h"
#else
# ifdef __KERNEL_SSE3__
# include "kernel/filter/filter_transform_sse.h"
# else
# include "kernel/filter/filter_transform.h"
# endif
#endif
#include "kernel/filter/filter_reconstruction.h"
#ifdef __KERNEL_CPU__
# include "kernel/filter/filter_nlm_cpu.h"
#else
# include "kernel/filter/filter_nlm_gpu.h"
#endif
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_divide_combined(int x, int y, int sample, ccl_global float *buffers, int offset, int stride, int pass_stride, int no_denoising_offset)
{
ccl_global float *combined_buffer = buffers + (offset + y*stride + x);
float fac = sample / combined_buffer[3];
combined_buffer[0] *= fac;
combined_buffer[1] *= fac;
combined_buffer[2] *= fac;
combined_buffer[3] *= fac;
if(no_denoising_offset) {
combined_buffer[0] += combined_buffer[no_denoising_offset+0];
combined_buffer[1] += combined_buffer[no_denoising_offset+1];
combined_buffer[2] += combined_buffer[no_denoising_offset+2];
}
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,163 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy, float ccl_restrict_ptr weightImage, float ccl_restrict_ptr varianceImage, float *differenceImage, int4 rect, int w, int channel_offset, float a, float k_2)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
for(int c = 0; c < numChannels; c++) {
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
float pvar = varianceImage[c*channel_offset + y*w+x];
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
diff *= 1.0f/numChannels;
}
differenceImage[y*w+x] = diff;
}
}
}
ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
#ifdef __KERNEL_SSE3__
int aligned_lowx = (rect.x & ~(3));
int aligned_highx = ((rect.z + 3) & ~(3));
#endif
for(int y = rect.y; y < rect.w; y++) {
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] = 0.0f;
}
for(int y1 = low; y1 < high; y1++) {
#ifdef __KERNEL_SSE3__
for(int x = aligned_lowx; x < aligned_highx; x+=4) {
_mm_store_ps(outImage + y*w+x, _mm_add_ps(_mm_load_ps(outImage + y*w+x), _mm_load_ps(differenceImage + y1*w+x)));
}
#else
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] += differenceImage[y1*w+x];
}
#endif
}
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] *= 1.0f/(high - low);
}
}
}
ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] = 0.0f;
}
}
for(int dx = -f; dx <= f; dx++) {
int pos_dx = max(0, dx);
int neg_dx = min(0, dx);
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
outImage[y*w+x] += differenceImage[y*w+dx+x];
}
}
}
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
outImage[y*w+x] = expf(-max(outImage[y*w+x] * (1.0f/(high - low)), 0.0f));
}
}
}
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
accumImage[y*w+x] += weight;
outImage[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
}
}
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride)
{
/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
int y = fy + filter_rect.y;
for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
int x = fx + filter_rect.x;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
int storage_ofs = fy*filter_rect.z + fx;
float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
int *l_rank = rank + storage_ofs;
kernel_filter_construct_gramian(x, y, 1,
dx, dy, w, h,
pass_stride,
buffer,
color_pass, variance_pass,
l_transform, l_rank,
weight, l_XtWX, l_XtWY, 0);
}
}
}
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
{
for(int y = rect.y; y < rect.w; y++) {
for(int x = rect.x; x < rect.z; x++) {
outImage[y*w+x] /= accumImage[y*w+x];
}
}
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,147 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
int dx, int dy,
ccl_global float ccl_restrict_ptr weightImage,
ccl_global float ccl_restrict_ptr varianceImage,
ccl_global float *differenceImage,
int4 rect, int w,
int channel_offset,
float a, float k_2)
{
float diff = 0.0f;
int numChannels = channel_offset? 3 : 1;
for(int c = 0; c < numChannels; c++) {
float cdiff = weightImage[c*channel_offset + y*w+x] - weightImage[c*channel_offset + (y+dy)*w+(x+dx)];
float pvar = varianceImage[c*channel_offset + y*w+x];
float qvar = varianceImage[c*channel_offset + (y+dy)*w+(x+dx)];
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
}
if(numChannels > 1) {
diff *= 1.0f/numChannels;
}
differenceImage[y*w+x] = diff;
}
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.y, y-f);
const int high = min(rect.w, y+f+1);
for(int y1 = low; y1 < high; y1++) {
sum += differenceImage[y1*w+x];
}
sum *= 1.0f/(high-low);
outImage[y*w+x] = sum;
}
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
sum *= 1.0f/(high-low);
outImage[y*w+x] = expf(-max(sum, 0.0f));
}
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
int dx, int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr image,
ccl_global float *outImage,
ccl_global float *accumImage,
int4 rect, int w, int f)
{
float sum = 0.0f;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
sum *= 1.0f/(high-low);
if(outImage) {
accumImage[y*w+x] += sum;
outImage[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
}
else {
accumImage[y*w+x] = sum;
}
}
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
int dx, int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride,
int localIdx)
{
int y = fy + filter_rect.y;
int x = fx + filter_rect.x;
const int low = max(rect.x, x-f);
const int high = min(rect.z, x+f+1);
float sum = 0.0f;
for(int x1 = low; x1 < high; x1++) {
sum += differenceImage[y*w+x1];
}
float weight = sum * (1.0f/(high - low));
int storage_ofs = fy*filter_rect.z + fx;
transform += storage_ofs;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_construct_gramian(x, y,
filter_rect.z*filter_rect.w,
dx, dy, w, h,
pass_stride,
buffer,
color_pass, variance_pass,
transform, rank,
weight, XtWX, XtWY,
localIdx);
}
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
ccl_global float *outImage,
ccl_global float ccl_restrict_ptr accumImage,
int4 rect, int w)
{
outImage[y*w+x] /= accumImage[y*w+x];
}
CCL_NAMESPACE_END

View File

@@ -0,0 +1,145 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
/* First step of the shadow prefiltering: performs the shadow division and stores all data
 * in a simple rectangular array that can be passed to the NLM filter.
*
* Calculates:
* unfiltered: Contains the two half images of the shadow feature pass
* sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
* sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
* bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
*/
ccl_device void kernel_filter_divide_shadow(int sample,
ccl_global TilesInfo *tiles,
int x, int y,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
ccl_global float *sampleVarianceV,
ccl_global float *bufferVariance,
int4 rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
int tile = ytile*3+xtile;
int offset = tiles->offsets[tile];
int stride = tiles->strides[tile];
ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
center_buffer += buffer_denoising_offset + 14;
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
float varA = center_buffer[2];
float varB = center_buffer[5];
int odd_sample = (sample+1)/2;
int even_sample = sample/2;
if(use_split_variance) {
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
}
varA /= (odd_sample - 1);
varB /= (even_sample - 1);
sampleVariance[idx] = 0.5f*(varA + varB) / sample;
sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
}
/* Load a regular feature from the render buffers into the denoise buffer.
* Parameters:
* - sample: The sample amount in the buffer, used to normalize the buffer.
* - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
* - x, y: Current pixel
* - mean, variance: Target denoise buffers.
* - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
*/
ccl_device void kernel_filter_get_feature(int sample,
ccl_global TilesInfo *tiles,
int m_offset, int v_offset,
int x, int y,
ccl_global float *mean,
ccl_global float *variance,
int4 rect, int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
int tile = ytile*3+xtile;
ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
mean[idx] = center_buffer[m_offset] / sample;
if(use_split_variance) {
variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
}
else {
variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
}
}
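The `use_split_variance` branch reconstructs an unbiased variance from the accumulated sums sum(x) and sum(x^2). A self-contained C++ sketch (names hypothetical) checking that formula against a two-pass reference:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

/* Variance of the mean from accumulated sums, as in the split-variance path:
 * max(0, (sum_x2 - mean^2 * n)) / (n * (n - 1)). */
inline float split_variance(float sum_x, float sum_x2, int n)
{
    float mean = sum_x / n;
    return std::max(0.0f, (sum_x2 - mean * mean * n) / (n * (n - 1.0f)));
}

/* Two-pass reference: corrected sum of squares divided by n*(n-1). */
inline float two_pass_variance(const std::vector<float> &x)
{
    float mean = 0.0f;
    for(float v : x) mean += v;
    mean /= x.size();
    float var = 0.0f;
    for(float v : x) var += (v - mean) * (v - mean);
    return var / (x.size() * (x.size() - 1.0f));
}
```

Note the divisor n*(n-1) rather than n-1: both paths estimate the variance of the per-pixel *mean*, not of a single sample, since the denoiser operates on the accumulated buffer.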
/* Combine A/B buffers.
* Calculates the combined mean and the buffer variance. */
ccl_device void kernel_filter_combine_halves(int x, int y,
ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
int4 rect, int r)
{
int buffer_w = align_up(rect.z - rect.x, 4);
int idx = (y-rect.y)*buffer_w + (x - rect.x);
if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
if(variance) {
if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
else {
variance[idx] = 0.0f;
float values[25];
int numValues = 0;
for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
int pidx = (py-rect.y)*buffer_w + (px-rect.x);
values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
}
}
/* Insertion-sort the variances (fast enough for 25 elements). */
for(int i = 1; i < numValues; i++) {
float v = values[i];
int j;
for(j = i-1; j >= 0 && values[j] > v; j--)
values[j+1] = values[j];
values[j+1] = v;
}
variance[idx] = values[(7*numValues)/8];
}
}
}
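The windowed branch above replaces the pixel's own A/B difference with a high quantile of the differences in its neighborhood, sorted with insertion sort. A small C++ sketch of that selection (function name hypothetical):

```cpp
#include <cassert>

/* Insertion-sort the values and return the 7/8 quantile, as used for the
 * windowed buffer variance. O(n^2) sorting is fine for at most 25 values. */
inline float quantile_7_8(float *values, int n)
{
    for(int i = 1; i < n; i++) {
        float v = values[i];
        int j;
        for(j = i - 1; j >= 0 && values[j] > v; j--)
            values[j + 1] = values[j];
        values[j + 1] = v;
    }
    return values[(7 * n) / 8];
}
```

Taking a near-maximum quantile instead of the mean makes the variance estimate robust to a few fireflies in the window without letting a single outlier dominate.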
CCL_NAMESPACE_END


@@ -0,0 +1,103 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
int storage_stride,
int dx, int dy,
int w, int h,
int pass_stride,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
float weight,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int localIdx)
{
int p_offset = y *w + x;
int q_offset = (y+dy)*w + (x+dx);
#ifdef __KERNEL_CPU__
const int stride = 1;
(void)storage_stride;
(void)localIdx;
float design_row[DENOISE_FEATURES+1];
#elif defined(__KERNEL_CUDA__)
const int stride = storage_stride;
ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
#else
const int stride = storage_stride;
float design_row[DENOISE_FEATURES+1];
#endif
float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
return;
}
filter_get_design_row_transform(make_int2(x, y), buffer + p_offset,
make_int2(x+dx, y+dy), buffer + q_offset,
pass_stride, *rank, design_row, transform, stride);
math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
}
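`math_trimatrix_add_gramian_strided` accumulates the weighted outer product w * d * d^T, and per the branch's "only store lower-triangular parts of XtWX" change, only the packed lower triangle is kept. A hedged C++ sketch of that storage scheme (function name hypothetical):

```cpp
#include <cassert>

/* Accumulate w * d * d^T into a symmetric n x n matrix stored as its packed
 * lower triangle: n*(n+1)/2 floats instead of n*n. For n = 11 (rank+1 with
 * DENOISE_FEATURES = 10) that is 66 instead of 121 floats, saving 55 per pixel. */
inline void add_gramian_packed(float *A, int n, const float *d, float w)
{
    int idx = 0;
    for(int row = 0; row < n; row++)
        for(int col = 0; col <= row; col++, idx++)
            A[idx] += w * d[row] * d[col];
}
```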
ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
ccl_global float *buffer,
ccl_global int *rank,
int storage_stride,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 buffer_params,
int sample)
{
#ifdef __KERNEL_CPU__
const int stride = 1;
(void)storage_stride;
#else
const int stride = storage_stride;
#endif
math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
float3 final_color = XtWY[0];
ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
final_color *= sample;
if(buffer_params.w) {
final_color.x += combined_buffer[buffer_params.w+0];
final_color.y += combined_buffer[buffer_params.w+1];
final_color.z += combined_buffer[buffer_params.w+2];
}
combined_buffer[0] = final_color.x;
combined_buffer[1] = final_color.y;
combined_buffer[2] = final_color.z;
}
#undef STORAGE_TYPE
CCL_NAMESPACE_END
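`kernel_filter_finalize` solves the accumulated normal equations (XtWX) beta = XtWY per color channel; the denoised color is the first entry of the solution. A minimal C++ sketch of the same step for a 2x2 system via Cramer's rule (name hypothetical; the kernel's `math_trimatrix_vec3_solve` handles up to rank+1 unknowns):

```cpp
#include <cassert>
#include <cmath>

/* Solve the 2x2 normal equations (XtWX) beta = XtWY by Cramer's rule.
 * Assumes the system is well-conditioned (det != 0). */
inline void solve2x2(const float A[2][2], const float b[2], float beta[2])
{
    float det = A[0][0] * A[1][1] - A[0][1] * A[1][0];
    beta[0] = (b[0] * A[1][1] - b[1] * A[0][1]) / det;
    beta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / det;
}
```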


@@ -0,0 +1,112 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
float *transform, int *rank,
int radius, float pca_threshold)
{
int buffer_w = align_up(rect.z - rect.x, 4);
float features[DENOISE_FEATURES];
/* Temporary storage, used in different steps of the algorithm. */
float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
float tempvector[2*DENOISE_FEATURES];
float ccl_restrict_ptr pixel_buffer;
int2 pixel;
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
math_vector_zero(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
math_vector_add(feature_means, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
/* === Scale the shifted feature passes to the range [-1, 1]; the scaling will be baked into the transform later. === */
float *feature_scale = tempvector;
math_vector_zero(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_max(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
filter_calculate_scale(feature_scale);
/* === Generate the feature transformation. ===
* This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float* feature_matrix = tempmatrix;
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
/* Bake the feature scaling into the transformation matrix. */
math_vector_mul(transform + (*rank)*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
math_vector_mul(transform + (*rank)*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
}
CCL_NAMESPACE_END
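The `pca_threshold < 0.0f` branch selects the reduced rank by retained eigenvalue energy, with a minimum of two components enforced by the `i >= 2` check. A standalone C++ sketch of that rank selection (name hypothetical; `t` corresponds to `-pca_threshold`):

```cpp
#include <cassert>

/* Pick the rank from eigenvalues sorted in descending order: keep components
 * until the retained energy reaches (1 - t) of the total, but always keep at
 * least two, mirroring the energy-threshold branch of the transform kernel. */
inline int select_rank(const float *eigenvalue, int n, float t)
{
    float total = 0.0f;
    for(int i = 0; i < n; i++) total += eigenvalue[i];
    float threshold = (1.0f - t) * total;
    float kept = 0.0f;
    int rank = 0;
    for(int i = 0; i < n; i++, rank++) {
        if(i >= 2 && kept >= threshold) break;
        kept += eigenvalue[i];
    }
    return rank;
}
```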


@@ -0,0 +1,113 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
ccl_global float *transform,
ccl_global int *rank,
int radius, float pca_threshold,
int transform_stride, int localIdx)
{
int buffer_w = align_up(rect.z - rect.x, 4);
#ifdef __KERNEL_CUDA__
ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
#else
float features[DENOISE_FEATURES];
#endif
/* === Calculate denoising window. === */
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
ccl_global float ccl_restrict_ptr pixel_buffer;
int2 pixel;
/* === Shift feature passes to have mean 0. === */
float feature_means[DENOISE_FEATURES];
math_vector_zero(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
math_vector_add(feature_means, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);
/* === Scale the shifted feature passes to the range [-1, 1]; the scaling will be baked into the transform later. === */
float feature_scale[DENOISE_FEATURES];
math_vector_zero(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_max(feature_scale, features, DENOISE_FEATURES);
} END_FOR_PIXEL_WINDOW
filter_calculate_scale(feature_scale);
/* === Generate the feature transformation. ===
* This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
* which generally has fewer dimensions. This mainly helps to prevent overfitting. */
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero(feature_matrix, DENOISE_FEATURES);
FOR_PIXEL_WINDOW {
filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul(features, feature_scale, DENOISE_FEATURES);
math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
} END_FOR_PIXEL_WINDOW
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
/* Bake the feature scaling into the transformation matrix. */
math_vector_mul_strided(transform + i*DENOISE_FEATURES*transform_stride, feature_scale, transform_stride, DENOISE_FEATURES);
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
math_vector_mul_strided(transform + i*DENOISE_FEATURES*transform_stride, feature_scale, transform_stride, DENOISE_FEATURES);
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);
}
CCL_NAMESPACE_END


@@ -0,0 +1,105 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CCL_NAMESPACE_BEGIN
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
int x, int y, int4 rect,
int pass_stride,
float *transform, int *rank,
int radius, float pca_threshold)
{
int buffer_w = align_up(rect.z - rect.x, 4);
__m128 features[DENOISE_FEATURES];
float ccl_restrict_ptr pixel_buffer;
int2 pixel;
int2 low = make_int2(max(rect.x, x - radius),
max(rect.y, y - radius));
int2 high = make_int2(min(rect.z, x + radius + 1),
min(rect.w, y + radius + 1));
__m128 feature_means[DENOISE_FEATURES];
math_vector_zero_sse(feature_means, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
} END_FOR_PIXEL_WINDOW_SSE
__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
for(int i = 0; i < DENOISE_FEATURES; i++) {
feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
}
__m128 feature_scale[DENOISE_FEATURES];
math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
for(int i = 0; i < DENOISE_FEATURES; i++)
feature_scale[i] = _mm_max_ps(feature_scale[i], features[i]);
} END_FOR_PIXEL_WINDOW_SSE
filter_calculate_scale_sse(feature_scale);
__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
FOR_PIXEL_WINDOW_SSE {
filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
} END_FOR_PIXEL_WINDOW_SSE
float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);
math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);
*rank = 0;
if(pca_threshold < 0.0f) {
float threshold_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++) {
threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
}
threshold_energy *= 1.0f - (-pca_threshold);
float reduced_energy = 0.0f;
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
if(i >= 2 && reduced_energy >= threshold_energy)
break;
float s = feature_matrix[i*DENOISE_FEATURES+i];
reduced_energy += s;
/* Bake the feature scaling into the transformation matrix. */
for(int j = 0; j < DENOISE_FEATURES; j++) {
transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]);
}
}
}
else {
for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
float s = feature_matrix[i*DENOISE_FEATURES+i];
if(i >= 2 && sqrtf(s) < pca_threshold)
break;
/* Bake the feature scaling into the transformation matrix. */
for(int j = 0; j < DENOISE_FEATURES; j++) {
transform[(*rank)*DENOISE_FEATURES + j] *= _mm_cvtss_f32(feature_scale[j]);
}
}
}
math_matrix_transpose(transform, DENOISE_FEATURES, 1);
}
CCL_NAMESPACE_END


@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
/* Interpolate smooth vertex normal from vertices */
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
{
/* load triangle vertices */
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
return is_zero(N)? Ng: N;
}
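The hunk above guards against a degenerate interpolated normal by falling back to the geometric normal Ng. A simplified C++ sketch of that pattern (types and name are illustrative, not the Cycles API):

```cpp
#include <cassert>
#include <cmath>

struct Vec3 { float x, y, z; };

/* Normalize, but return the fallback (e.g. the geometric normal Ng) when the
 * input has zero length, matching the safe_normalize + is_zero combination. */
inline Vec3 safe_normalize_or(const Vec3 &v, const Vec3 &fallback)
{
    float len = std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
    if(len == 0.0f)
        return fallback;
    return Vec3{v.x / len, v.y / len, v.z / len};
}
```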
/* Ray differentials on triangle */


@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu.h"
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu.h"
CCL_NAMESPACE_END


@@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
L->denoising_depth = 0.0f;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
@@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
}
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 alpha,
float3 bsdf,
float3 ao,
int bounce)
float3 ao)
{
#ifdef __PASSES__
if(L->use_light_pass) {
if(bounce == 0) {
if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf*ao;
L->ao += alpha*throughput*ao;
@@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
float3 light = throughput * bsdf;
L->path_total += light;
L->path_total_shaded += ao * light;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
float3 light = throughput * bsdf;
L->path_total += light;
L->path_total_shaded += ao * light;
}
#endif
}
ccl_device_inline void path_radiance_accum_total_ao(
PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
float3 bsdf)
{
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * bsdf;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * bsdf;
}
#else
(void) L;
(void) state;
(void) throughput;
(void) bsdf;
#endif
}
ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
BsdfEval *bsdf_eval,
float3 shadow,
float shadow_fac,
bool is_lamp)
{
#ifdef __PASSES__
if(L->use_light_pass) {
if(bounce == 0) {
if(state->bounce == 0) {
/* directly visible lighting */
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
@@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
}
#ifdef __SHADOW_TRICKS__
float3 light = throughput * bsdf_eval->sum_no_mis;
L->path_total += light;
L->path_total_shaded += shadow * light;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
float3 light = throughput * bsdf_eval->sum_no_mis;
L->path_total += light;
L->path_total_shaded += shadow * light;
}
#endif
}
ccl_device_inline void path_radiance_accum_total_light(
PathRadiance *L,
ccl_addr_space PathState *state,
float3 throughput,
const BsdfEval *bsdf_eval)
{
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * bsdf_eval->sum_no_mis;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * bsdf_eval->sum_no_mis;
}
#else
(void) L;
(void) state;
(void) throughput;
(void) bsdf_eval;
#endif
@@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
}
#ifdef __SHADOW_TRICKS__
L->path_total += throughput * value;
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
L->path_total_shaded += throughput * value;
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
L->path_total += throughput * value;
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
L->path_total_shaded += throughput * value;
}
}
#endif
#ifdef __DENOISING_FEATURES__
L->denoising_albedo += state->denoising_feature_weight * value;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
@@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
return L_sum;
}
ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
{
#ifdef __PASSES__
kernel_assert(L->use_light_pass);
*clean = L->emission + L->background;
*noisy = L->direct_scatter + L->indirect_scatter;
# define ADD_COMPONENT(flag, component) \
if(kernel_data.film.denoising_flags & flag) \
*clean += component; \
else \
*noisy += component;
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR, L->direct_diffuse);
ADD_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND, L->indirect_diffuse);
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR, L->direct_glossy);
ADD_COMPONENT(DENOISING_CLEAN_GLOSSY_IND, L->indirect_glossy);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
ADD_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR, L->direct_subsurface);
ADD_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND, L->indirect_subsurface);
# undef ADD_COMPONENT
#else
*noisy = L->emission;
*clean = make_float3(0.0f, 0.0f, 0.0f);
#endif
*noisy = ensure_finite3(*noisy);
*clean = ensure_finite3(*clean);
}
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
{
float fac = 1.0f/num_samples;
@@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
/* Calculate final light sum and transparency for shadow catcher object. */
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
const PathRadiance *L,
ccl_addr_space float* L_transparent)
float* alpha)
{
const float shadow = path_radiance_sum_shadow(L);
float3 L_sum;
if(kernel_data.background.transparent) {
*L_transparent = shadow;
*alpha = 1.0f-shadow;
L_sum = make_float3(0.0f, 0.0f, 0.0f);
}
else {


@@ -42,6 +42,8 @@
#include "util/util_types.h"
#include "util/util_texture.h"
#define ccl_restrict_ptr const * __restrict
#define ccl_addr_space
#define ccl_local_id(d) 0

View File

@@ -55,6 +55,10 @@
#define ccl_restrict __restrict__
#define ccl_align(n) __align__(n)
#define ccl_restrict_ptr const * __restrict__
#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
/* No assert supported for CUDA */
#define kernel_assert(cond)

View File

@@ -50,6 +50,8 @@
# define ccl_addr_space
#endif
#define ccl_restrict_ptr const * __restrict__
#define ccl_local_id(d) get_local_id(d)
#define ccl_global_id(d) get_global_id(d)


@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
cu = clamp(cu, -1.0f, 1.0f);
/* Compute xu. */
float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
xu = clamp(xu, x0, x1);
/* Compute yv. */
float z0sq = z0 * z0;


@@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
#endif /* __SPLIT_KERNEL__ */
}
#ifdef __DENOISING_FEATURES__
ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
{
kernel_write_pass_float(buffer, sample, value);
/* The online one-pass variance update used by the megakernel can't easily be implemented
 * with atomics, so the split kernel falls back to accumulating sum(x^2) and computing
 * the corrected sum of squares as sum(x^2) - (sum(x))^2/N. */
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+1, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float(buffer+1, sample, 0.0f);
}
else {
float new_mean = buffer[0] * (1.0f / (sample + 1));
float old_mean = (buffer[0] - value) * (1.0f / sample);
kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
}
# endif
}
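The megakernel branch above accumulates the corrected sum of squares one sample at a time using the (value - new_mean) * (value - old_mean) term. A C++ sketch (name hypothetical) showing that this one-pass accumulation matches the two-pass sum of squared deviations:

```cpp
#include <cassert>
#include <cmath>
#include <vector>

/* One-pass accumulation of M2 = sum((x - mean)^2) over the whole sequence,
 * using the same incremental term as the kernel's non-split branch. */
inline float online_m2(const std::vector<float> &x)
{
    float sum = 0.0f, m2 = 0.0f;
    for(size_t i = 0; i < x.size(); i++) {
        sum += x[i];
        if(i > 0) {
            float new_mean = sum / (i + 1);
            float old_mean = (sum - x[i]) / i;
            m2 += (x[i] - new_mean) * (x[i] - old_mean);
        }
    }
    return m2;
}
```

For {1, 2, 3, 4} the mean is 2.5 and the sum of squared deviations is 5, which the incremental terms 0.5 + 1.5 + 3.0 reproduce exactly.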
# if defined(__SPLIT_KERNEL__)
# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
# else
ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
{
buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
}
# endif
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
{
kernel_write_pass_float3_unaligned(buffer, sample, value);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
# else
if(sample == 0) {
kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
}
else {
float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
float3 new_mean = sum * (1.0f / (sample + 1));
float3 old_mean = (sum - value) * (1.0f / sample);
kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
}
# endif
}
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
int sample, float path_total, float path_total_shaded)
{
if(kernel_data.film.pass_denoising_data == 0)
return;
buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;
path_total = ensure_finite(path_total);
path_total_shaded = ensure_finite(path_total_shaded);
kernel_write_pass_float(buffer, sample/2, path_total);
kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
float value = path_total_shaded / max(path_total, 1e-7f);
# ifdef __SPLIT_KERNEL__
kernel_write_pass_float(buffer+2, sample/2, value*value);
# else
if(sample < 2) {
kernel_write_pass_float(buffer+2, sample/2, 0.0f);
}
else {
float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
float new_value = buffer[1] / max(buffer[0], 1e-7f);
kernel_write_pass_float(buffer+2, sample/2, (value - new_value) * (value - old_value));
}
# endif
}
#endif /* __DENOISING_FEATURES__ */
ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
ShaderData *sd,
ccl_global PathState *state,
PathRadiance *L)
{
#ifdef __DENOISING_FEATURES__
if(state->denoising_feature_weight == 0.0f) {
return;
}
L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);
float3 normal = make_float3(0.0f, 0.0f, 0.0f);
float3 albedo = make_float3(0.0f, 0.0f, 0.0f);
float sum_weight = 0.0f, sum_nonspecular_weight = 0.0f;
for(int i = 0; i < sd->num_closure; i++) {
ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF_OR_BSSRDF(sc->type))
continue;
/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
normal += sc->N * sc->sample_weight;
sum_weight += sc->sample_weight;
if(!bsdf_is_specular_like(sc)) {
albedo += sc->weight;
sum_nonspecular_weight += sc->sample_weight;
}
}
/* Write the feature passes unless 75% or more of the sample weight belongs to
 * specular-like closures; in that case, wait for the next bounce. */
if((sum_weight == 0.0f) || (sum_nonspecular_weight*4.0f > sum_weight)) {
if(sum_weight != 0.0f) {
normal /= sum_weight;
}
L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal);
L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo);
state->denoising_feature_weight = 0.0f;
}
#else
(void) kg;
(void) sd;
(void) state;
(void) L;
#endif /* __DENOISING_FEATURES__ */
}
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
{
@@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
#endif
}
ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
{
if(L) {
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(is_shadow_catcher) {
L_sum = path_radiance_sum_shadowcatcher(kg, L, &alpha);
}
else
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, L);
}
kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
# ifdef __SHADOW_TRICKS__
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, average(L->path_total), average(L->path_total_shaded));
# else
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
# endif
if(kernel_data.film.pass_denoising_clean) {
float3 noisy, clean;
path_radiance_split_denoising(kg, L, &noisy, &clean);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, noisy);
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
sample, clean);
}
else {
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, L_sum);
}
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
sample, L->denoising_normal);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
sample, L->denoising_albedo);
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
sample, L->denoising_depth);
}
#endif /* __DENOISING_FEATURES__ */
}
else {
kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
kernel_write_denoising_shadow(kg, buffer + kernel_data.film.pass_denoising_data, sample, 0.0f, 0.0f);
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_COLOR,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_NORMAL,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float3_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_ALBEDO,
sample, make_float3(0.0f, 0.0f, 0.0f));
kernel_write_pass_float_variance(buffer + kernel_data.film.pass_denoising_data + DENOISING_PASS_DEPTH,
sample, 0.0f);
if(kernel_data.film.pass_denoising_clean) {
kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
sample, make_float3(0.0f, 0.0f, 0.0f));
}
}
#endif /* __DENOISING_FEATURES__ */
}
}
CCL_NAMESPACE_END


@@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, throughput, ao_bsdf);
path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
}
}
}
@@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
throughput /= probability;
}
kernel_update_denoising_features(kg, sd, state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
@@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
}
ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer)
ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer,
PathRadiance *L,
bool *is_shadow_catcher)
{
/* initialize */
PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
path_radiance_init(&L, kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
float3 emission;
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __LAMP_MIS__ */
@@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
@@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* indirect light bounce */
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
continue;
else
break;
@@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, &state, throughput, L_background);
path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
/* blurring of bsdf after bounces, for rays that have a small likelihood
* of following this particular path (diffuse, rough glossy) */
@@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(sd.flag & SD_EMISSION) {
/* todo: is isect.t wrong here for transparent surfaces? */
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
throughput /= probability;
}
kernel_update_denoising_features(kg, &sd, &state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
}
#endif /* __AO__ */
@@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
if(kernel_path_subsurface_scatter(kg,
&sd,
&emission_sd,
&L,
L,
&state,
rng,
&ray,
@@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
#endif /* __SUBSURFACE__ */
/* direct lighting */
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
/* compute direct lighting and next bounce */
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
break;
}
#ifdef __SUBSURFACE__
kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
kernel_path_subsurface_accum_indirect(&ss_indirect, L);
/* Trace indirect subsurface rays by restarting the loop. this uses less
* stack memory than invoking kernel_path_indirect.
@@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
&ss_indirect,
&state,
&ray,
&L,
L,
&throughput);
}
else {
@@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
}
else
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, &L);
}
kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
return 1.0f - L_transparent;
}
ccl_device void kernel_path_trace(KernelGlobals *kg,
@@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
float4 L;
PathRadiance L;
bool is_shadow_catcher;
if(ray.t != 0.0f)
L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
else
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L);
if(ray.t != 0.0f) {
float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
}
else {
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
}
path_rng_end(kg, rng_state, rng);
}


@@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
light_ray.dD = differential3_zero();
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
}
else {
path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
}
}
}
@@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
{
float sum_sample_weight = 0.0f;
#ifdef __DENOISING_FEATURES__
if(state->denoising_feature_weight > 0.0f) {
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
/* transparency is not handled here, but in outer loop */
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
sum_sample_weight += sc->sample_weight;
}
}
else {
sum_sample_weight = 1.0f;
}
#endif /* __DENOISING_FEATURES__ */
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
if(!CLOSURE_IS_BSDF(sc->type))
continue;
/* transparency is not handled here, but in outer loop */
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
int num_samples;
@@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
&tp,
&ps,
L,
&bsdf_ray))
&bsdf_ray,
sum_sample_weight))
{
continue;
}
@@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
}
#endif /* __SUBSURFACE__ */
ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
RNG *rng,
int sample,
Ray ray,
ccl_global float *buffer,
PathRadiance *L,
bool *is_shadow_catcher)
{
/* initialize */
PathRadiance L;
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
float L_transparent = 0.0f;
path_radiance_init(&L, kernel_data.film.use_light_pass);
path_radiance_init(L, kernel_data.film.use_light_pass);
/* shader data memory used for both volumes and surfaces, saves stack space */
ShaderData sd;
@@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = kernel_data.integrator.sample_all_lights_direct;
kernel_branched_path_volume_connect_light(kg, rng, &sd,
&emission_sd, throughput, &state, &L, all,
&emission_sd, throughput, &state, L, all,
&volume_ray, &volume_segment);
/* indirect light sampling */
@@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
&sd,
&tp,
&ps,
&L,
L,
&pray))
{
kernel_path_indirect(kg,
@@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp*num_samples_inv,
num_samples,
&ps,
&L);
L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&L);
path_radiance_reset_indirect(&L);
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
}
/* emission and transmittance */
if(volume_segment.closure_flag & SD_EMISSION)
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
throughput *= volume_segment.accum_transmittance;
/* free cached steps */
@@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
path_state_branch(&ps, j, num_samples);
VolumeIntegrateResult result = kernel_volume_integrate(
kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
#ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* todo: support equiangular, MIS and all light sampling.
* alternatively get decoupled ray marching working on the GPU */
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
if(kernel_path_volume_bounce(kg,
rng,
&sd,
&tp,
&ps,
&L,
L,
&pray))
{
kernel_path_indirect(kg,
@@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
tp,
num_samples,
&ps,
&L);
L);
/* for render passes, sum and reset indirect light pass variables
* for the next samples */
path_radiance_sum_indirect(&L);
path_radiance_reset_indirect(&L);
path_radiance_sum_indirect(L);
path_radiance_reset_indirect(L);
}
}
#endif /* __VOLUME_SCATTER__ */
@@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __BACKGROUND__
/* sample background shader */
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
path_radiance_accum_background(&L, &state, throughput, L_background);
path_radiance_accum_background(L, &state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
@@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state.flag & PATH_RAY_CAMERA) {
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state.catcher_object = sd.object;
if(!kernel_data.background.transparent) {
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
}
}
}
@@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __HOLDOUT__ */
/* holdout mask objects do not write data passes */
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
#ifdef __EMISSION__
/* emission */
if(sd.flag & SD_EMISSION) {
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
path_radiance_accum_emission(L, throughput, emission, state.bounce);
}
#endif /* __EMISSION__ */
@@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
}
}
kernel_update_denoising_features(kg, &sd, &state, L);
#ifdef __AO__
/* ambient occlusion */
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
}
#endif /* __AO__ */
@@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
/* bssrdf scatter to a different location on the same object */
if(sd.flag & SD_BSSRDF) {
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
&L, &state, rng, &ray, throughput);
L, &state, rng, &ray, throughput);
}
#endif /* __SUBSURFACE__ */
@@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
int all = (kernel_data.integrator.sample_all_lights_direct) ||
(state.flag & PATH_RAY_SHADOW_CATCHER);
kernel_branched_path_surface_connect_light(kg, rng,
&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
}
#endif /* __EMISSION__ */
/* indirect light */
kernel_branched_path_surface_indirect_light(kg, rng,
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
/* continue in case of transparency */
throughput *= shader_bsdf_transparency(kg, &sd);
@@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
#endif /* __VOLUME__ */
}
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
}
else
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, &L);
}
kernel_write_light_passes(kg, buffer, &L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
#endif /* __KERNEL_DEBUG__ */
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
return 1.0f - L_transparent;
}
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
@@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
/* integrate */
float4 L;
PathRadiance L;
bool is_shadow_catcher;
if(ray.t != 0.0f)
L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
else
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L);
if(ray.t != 0.0f) {
float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
}
else {
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
}
path_rng_end(kg, rng_state, rng);
}
@@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
#endif /* __BRANCHED_PATH__ */
CCL_NAMESPACE_END


@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
state->transmission_bounce = 0;
state->transparent_bounce = 0;
#ifdef __DENOISING_FEATURES__
if(kernel_data.film.pass_denoising_data) {
state->flag |= PATH_RAY_STORE_SHADOW_INFO;
state->denoising_feature_weight = 1.0f;
}
else {
state->denoising_feature_weight = 0.0f;
}
#endif /* __DENOISING_FEATURES__ */
state->min_ray_pdf = FLT_MAX;
state->ray_pdf = 0.0f;
#ifdef __LAMP_MIS__
@@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
/* random number generator next bounce */
state->rng_offset += PRNG_BOUNCE_NUM;
if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
}
}
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)

View File

@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
}
}
}
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
}
}
}
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
ccl_addr_space float3 *throughput,
ccl_addr_space PathState *state,
PathRadiance *L,
ccl_addr_space Ray *ray)
ccl_addr_space Ray *ray,
float sum_sample_weight)
{
/* sample BSDF */
float bsdf_pdf;
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
/* modify throughput */
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
#ifdef __DENOISING_FEATURES__
state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
#endif
/* modify path state */
path_state_next(kg, state, label);
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}
}


@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
}
}
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
}
}
}
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
/* accumulate */
path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
}
}
}


@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
{
if(is_zero(dir))
return make_float2(0.0f, 0.0f);
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;


@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __DPDU__
/* dPdu/dPdv */
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
sd->N = Ng;
if(sd->shader & SHADER_SMOOTH_NORMAL)
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
# ifdef __DPDU__
/* dPdu/dPdv */
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
if(sd->type & PRIMITIVE_TRIANGLE) {
/* smooth normal */
if(sd->shader & SHADER_SMOOTH_NORMAL) {
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
#ifdef __INSTANCING__
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {


@@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
#define __PATCH_EVAL__
#define __SHADOW_TRICKS__
#define __DENOISING_FEATURES__
#ifdef __KERNEL_SHADING__
# define __SVM__
# define __EMISSION__
@@ -314,31 +316,32 @@ enum SamplingPattern {
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
enum PathRayFlag {
PATH_RAY_CAMERA = 1,
PATH_RAY_REFLECT = 2,
PATH_RAY_TRANSMIT = 4,
PATH_RAY_DIFFUSE = 8,
PATH_RAY_GLOSSY = 16,
PATH_RAY_SINGULAR = 32,
PATH_RAY_TRANSPARENT = 64,
PATH_RAY_CAMERA = (1 << 0),
PATH_RAY_REFLECT = (1 << 1),
PATH_RAY_TRANSMIT = (1 << 2),
PATH_RAY_DIFFUSE = (1 << 3),
PATH_RAY_GLOSSY = (1 << 4),
PATH_RAY_SINGULAR = (1 << 5),
PATH_RAY_TRANSPARENT = (1 << 6),
PATH_RAY_SHADOW_OPAQUE = 128,
PATH_RAY_SHADOW_TRANSPARENT = 256,
PATH_RAY_SHADOW_OPAQUE = (1 << 7),
PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
/* Special flag to tag unaligned BVH nodes. */
PATH_RAY_NODE_UNALIGNED = 2048,
PATH_RAY_NODE_UNALIGNED = (1 << 11),
PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
PATH_RAY_MIS_SKIP = 4096,
PATH_RAY_DIFFUSE_ANCESTOR = 8192,
PATH_RAY_SINGLE_PASS_DONE = 16384,
PATH_RAY_SHADOW_CATCHER = 32768,
PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
PATH_RAY_MIS_SKIP = (1 << 12),
PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
PATH_RAY_SHADOW_CATCHER = (1 << 15),
PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
};
/* Closure Label */
@@ -394,6 +397,22 @@ typedef enum PassType {
#define PASS_ALL (~0)
typedef enum DenoisingPassOffsets {
DENOISING_PASS_NORMAL = 0,
DENOISING_PASS_NORMAL_VAR = 3,
DENOISING_PASS_ALBEDO = 6,
DENOISING_PASS_ALBEDO_VAR = 9,
DENOISING_PASS_DEPTH = 12,
DENOISING_PASS_DEPTH_VAR = 13,
DENOISING_PASS_SHADOW_A = 14,
DENOISING_PASS_SHADOW_B = 17,
DENOISING_PASS_COLOR = 20,
DENOISING_PASS_COLOR_VAR = 23,
DENOISING_PASS_SIZE_BASE = 26,
DENOISING_PASS_SIZE_CLEAN = 3,
} DenoisingPassOffsets;
typedef enum BakePassFilter {
BAKE_FILTER_NONE = 0,
BAKE_FILTER_DIRECT = (1 << 0),
@@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
} BakePassFilterCombos;
typedef enum DenoiseFlag {
DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
} DenoiseFlag;
typedef ccl_addr_space struct PathRadiance {
#ifdef __PASSES__
int use_light_pass;
@@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
/* Color of the background on which shadow is alpha-overed. */
float3 shadow_color;
#endif
#ifdef __DENOISING_FEATURES__
float3 denoising_normal;
float3 denoising_albedo;
float denoising_depth;
#endif /* __DENOISING_FEATURES__ */
} PathRadiance;
typedef struct BsdfEval {
@@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
#define SHADER_CLOSURE_BASE \
float3 weight; \
ClosureType type; \
float sample_weight \
float sample_weight; \
float3 N
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
SHADER_CLOSURE_BASE;
float data[14]; /* pad to 80 bytes */
float data[10]; /* pad to 80 bytes */
} ShaderClosure;
/* Shader Context
@@ -960,6 +998,10 @@ typedef struct PathState {
int transmission_bounce;
int transparent_bounce;
#ifdef __DENOISING_FEATURES__
float denoising_feature_weight;
#endif /* __DENOISING_FEATURES__ */
/* multiple importance sampling */
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
float ray_pdf; /* last bounce pdf */
@@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
float mist_inv_depth;
float mist_falloff;
int pass_denoising_data;
int pass_denoising_clean;
int denoising_flags;
int pad;
#ifdef __KERNEL_DEBUG__
int pass_bvh_traversed_nodes;
int pass_bvh_traversed_instances;


@@ -0,0 +1,61 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CPU kernel entry points */
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
* one with SSE2 intrinsics.
*/
#if defined(__x86_64__) || defined(_M_X64)
# define __KERNEL_SSE2__
#endif
/* When building kernel for native machine detect kernel features from the flags
* set by compiler.
*/
#ifdef WITH_KERNEL_NATIVE
# ifdef __SSE2__
# ifndef __KERNEL_SSE2__
# define __KERNEL_SSE2__
# endif
# endif
# ifdef __SSE3__
# define __KERNEL_SSE3__
# endif
# ifdef __SSSE3__
# define __KERNEL_SSSE3__
# endif
# ifdef __SSE4_1__
# define __KERNEL_SSE41__
# endif
# ifdef __AVX__
# define __KERNEL_SSE__
# define __KERNEL_AVX__
# endif
# ifdef __AVX2__
# define __KERNEL_SSE__
# define __KERNEL_AVX2__
# endif
#endif
/* quiet unused define warnings */
#if defined(__KERNEL_SSE2__)
/* do nothing */
#endif
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,39 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,40 @@
/*
* Copyright 2011-2014 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with AVX2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,132 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Templated common declaration part of all CPU kernels. */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
TilesInfo *tiles,
int x,
int y,
float *unfilteredA,
float *unfilteredB,
float *sampleV,
float *sampleVV,
float *bufferV,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
int x,
int y,
float *mean,
float *variance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance);
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
float *a,
float *b,
int* prefilter_rect,
int r);
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int x,
int y,
int storage_ofs,
float *transform,
int *rank,
int* rect,
int pass_stride,
int radius,
float pca_threshold);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weightImage,
float *variance,
float *differenceImage,
int* rect,
int w,
int channel_offset,
float a,
float k_2);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
float *outImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
float *outImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy,
float *differenceImage,
float *image,
float *outImage,
float *accumImage,
int* rect,
int w,
int f);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
float *differenceImage,
float *buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int *rect,
int *filter_rect,
int w,
int h,
int f,
int pass_stride);
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
float *accumImage,
int* rect,
int w);
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
int y,
int storage_ofs,
int w,
int h,
float *buffer,
int *rank,
float *XtWX,
float3 *XtWY,
int *buffer_params,
int sample);
#undef KERNEL_ARCH

View File

@@ -0,0 +1,259 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Templated common implementation part of all CPU kernels.
*
* The idea is that each particular .cpp file sets the needed optimization flags
* and simply includes this file, without worrying about copying the actual implementation over.
*/
#include "kernel/kernel_compat_cpu.h"
#include "kernel/filter/filter_kernel.h"
#ifdef KERNEL_STUB
# include "util/util_debug.h"
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
#endif
CCL_NAMESPACE_BEGIN
/* Denoise filter */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
TilesInfo *tiles,
int x,
int y,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
float *sampleVarianceV,
float *bufferVariance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
#else
kernel_filter_divide_shadow(sample, tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
int x,
int y,
float *mean, float *variance,
int* prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
#else
kernel_filter_get_feature(sample, tiles,
m_offset, v_offset,
x, y,
mean, variance,
load_int4(prefilter_rect),
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
float *mean,
float *variance,
float *a,
float *b,
int* prefilter_rect,
int r)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
#else
kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
int x,
int y,
int storage_ofs,
float *transform,
int *rank,
int* prefilter_rect,
int pass_stride,
int radius,
float pca_threshold)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
#else
rank += storage_ofs;
transform += storage_ofs*TRANSFORM_SIZE;
kernel_filter_construct_transform(buffer,
x, y,
load_int4(prefilter_rect),
pass_stride,
transform,
rank,
radius,
pca_threshold);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
int dy,
float *weightImage,
float *variance,
float *differenceImage,
int *rect,
int w,
int channel_offset,
float a,
float k_2)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
#else
kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
float *outImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
#else
kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
float *outImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
#else
kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
int dy,
float *differenceImage,
float *image,
float *outImage,
float *accumImage,
int *rect,
int w,
int f)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
#else
kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
int dy,
float *differenceImage,
float *buffer,
float *color_pass,
float *variance_pass,
float *transform,
int *rank,
float *XtWX,
float3 *XtWY,
int *rect,
int *filter_rect,
int w,
int h,
int f,
int pass_stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
#else
kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
float *accumImage,
int *rect,
int w)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
#else
kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
#endif
}
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
int y,
int storage_ofs,
int w,
int h,
float *buffer,
int *rank,
float *XtWX,
float3 *XtWY,
int *buffer_params,
int sample)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, filter_finalize);
#else
XtWX += storage_ofs*XTWX_SIZE;
XtWY += storage_ofs*XTWY_SIZE;
rank += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
#endif
}
#undef KERNEL_STUB
#undef STUB_ASSERT
#undef KERNEL_ARCH
CCL_NAMESPACE_END

View File

@@ -0,0 +1,34 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE2
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,36 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -0,0 +1,37 @@
/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPUs. */
#include "util/util_optimization.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/filter/filter.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/filter_cpu_impl.h"

View File

@@ -17,21 +17,23 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,21 +18,23 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
#undef KERNEL_ARCH

View File

@@ -57,6 +57,11 @@
# include "kernel/split/kernel_buffer_update.h"
#endif
#ifdef KERNEL_STUB
# include "util/util_debug.h"
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
#endif
CCL_NAMESPACE_BEGIN
#ifndef __SPLIT_KERNEL__
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef __BRANCHED_PATH__
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, path_trace);
#else
# ifdef __BRANCHED_PATH__
if(kernel_data.integrator.branched) {
kernel_branched_path_trace(kg,
buffer,
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
stride);
}
else
#endif
# endif
{
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
}
#endif /* KERNEL_STUB */
}
/* Film */
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
#else
kernel_film_convert_to_byte(kg,
rgba,
buffer,
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
x, y,
offset,
stride);
#endif /* KERNEL_STUB */
}
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
int offset,
int stride)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
#else
kernel_film_convert_to_half_float(kg,
rgba,
buffer,
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
x, y,
offset,
stride);
#endif /* KERNEL_STUB */
}
/* Shader Evaluate */
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
int offset,
int sample)
{
#ifdef KERNEL_STUB
STUB_ASSERT(KERNEL_ARCH, shader);
#else
if(type >= SHADER_EVAL_BAKE) {
kernel_assert(output_luma == NULL);
#ifdef __BAKING__
# ifdef __BAKING__
kernel_bake_evaluate(kg,
input,
output,
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
offset,
sample);
#endif
# endif
}
else {
kernel_shader_evaluate(kg,
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
i,
sample);
}
#endif /* KERNEL_STUB */
}
#else /* __SPLIT_KERNEL__ */
/* Split Kernel Path Tracing */
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
#ifdef KERNEL_STUB
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
STUB_ASSERT(KERNEL_ARCH, name); \
}
#else
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
{ \
kernel_##name(kg); \
}
#endif /* KERNEL_STUB */
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
{
#define REGISTER_NAME_STRING(name) #name
#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
REGISTER(path_trace);
REGISTER(convert_to_byte);
REGISTER(convert_to_half_float);
REGISTER(shader);
REGISTER(data_init);
REGISTER(path_init);
REGISTER(scene_intersect);
REGISTER(lamp_emission);
REGISTER(do_volume);
REGISTER(queue_enqueue);
REGISTER(indirect_background);
REGISTER(shader_setup);
REGISTER(shader_sort);
REGISTER(shader_eval);
REGISTER(holdout_emission_blurring_pathtermination_ao);
REGISTER(subsurface_scatter);
REGISTER(direct_lighting);
REGISTER(shadow_blocked_ao);
REGISTER(shadow_blocked_dl);
REGISTER(next_iteration_setup);
REGISTER(indirect_subsurface);
REGISTER(buffer_update);
#undef REGISTER
#undef REGISTER_EVAL_NAME
#undef REGISTER_NAME_STRING
}
#endif /* __SPLIT_KERNEL__ */
#undef KERNEL_STUB
#undef STUB_ASSERT
#undef KERNEL_ARCH
CCL_NAMESPACE_END

View File

@@ -17,22 +17,25 @@
/* Optimized CPU kernel entry points. This file is compiled with AVX
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,23 +18,25 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_avx2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE__
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# define __KERNEL_AVX__
# define __KERNEL_AVX2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_avx2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,19 +18,21 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,20 +18,22 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#define __SPLIT_KERNEL__
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,15 +18,17 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse2
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse2
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,17 +18,19 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse3
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse3
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -18,18 +18,20 @@
* optimization flags and nearly all functions inlined, while kernel.cpp
* is compiled without for other CPU's. */
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
#endif
#include "util/util_optimization.h"
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# include "kernel/kernel.h"
# define KERNEL_ARCH cpu_sse41
# include "kernel/kernels/cpu//kernel_cpu_impl.h"
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
# define KERNEL_STUB
#else
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
# define __KERNEL_SSE2__
# define __KERNEL_SSE3__
# define __KERNEL_SSSE3__
# define __KERNEL_SSE41__
# endif
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
#include "kernel/kernel.h"
#define KERNEL_ARCH cpu_sse41
#include "kernel/kernels/cpu/kernel_cpu_impl.h"

View File

@@ -0,0 +1,235 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* CUDA kernel entry points */
#ifdef __CUDA_ARCH__
#include "kernel_config.h"
#include "kernel/kernel_compat_cuda.h"
#include "kernel/filter/filter_kernel.h"
/* kernels */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
TilesInfo *tiles,
float *unfilteredA,
float *unfilteredB,
float *sampleVariance,
float *sampleVarianceV,
float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
TilesInfo *tiles,
int m_offset,
int v_offset,
float *mean,
float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
bool use_split_variance)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
m_offset, v_offset,
x, y,
mean, variance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
float *transform, int *rank,
int4 filter_area, int4 rect,
int radius, float pca_threshold,
int pass_stride)
{
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
int *l_rank = rank + y*filter_area.z + x;
float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
x + filter_area.x, y + filter_area.y,
rect, pass_stride,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
float ccl_restrict_ptr weightImage,
float ccl_restrict_ptr varianceImage,
float *differenceImage,
int4 rect, int w,
int channel_offset,
float a, float k_2) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr image,
float *outImage, float *accumImage,
int4 rect, int w,
int f) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
float ccl_restrict_ptr differenceImage,
float ccl_restrict_ptr buffer,
float *color_pass,
float *variance_pass,
float const* __restrict__ transform,
int *rank,
float *XtWX,
float3 *XtWY,
int4 rect,
int4 filter_rect,
int w, int h, int f,
int pass_stride) {
int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
kernel_filter_nlm_construct_gramian(x, y,
dx, dy,
differenceImage,
buffer,
color_pass, variance_pass,
transform, rank,
XtWX, XtWY,
rect, filter_rect,
w, h, f,
pass_stride,
threadIdx.y*blockDim.x + threadIdx.x);
}
}
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
float *buffer, int *rank,
float *XtWX, float3 *XtWY,
int4 filter_area, int4 buffer_params,
int sample) {
int x = blockDim.x*blockIdx.x + threadIdx.x;
int y = blockDim.y*blockIdx.y + threadIdx.y;
if(x < filter_area.z && y < filter_area.w) {
int storage_ofs = y*filter_area.z+x;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
}
}
#endif

@@ -0,0 +1,262 @@
/*
* Copyright 2011-2017 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* OpenCL kernel entry points */
#include "kernel/kernel_compat_opencl.h"
#include "kernel/filter/filter_kernel.h"
/* kernels */
__kernel void kernel_ocl_filter_divide_shadow(int sample,
ccl_global TilesInfo *tiles,
ccl_global float *unfilteredA,
ccl_global float *unfilteredB,
ccl_global float *sampleVariance,
ccl_global float *sampleVarianceV,
ccl_global float *bufferVariance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
char use_split_variance)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_divide_shadow(sample,
tiles,
x, y,
unfilteredA,
unfilteredB,
sampleVariance,
sampleVarianceV,
bufferVariance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
__kernel void kernel_ocl_filter_get_feature(int sample,
ccl_global TilesInfo *tiles,
int m_offset,
int v_offset,
ccl_global float *mean,
ccl_global float *variance,
int4 prefilter_rect,
int buffer_pass_stride,
int buffer_denoising_offset,
char use_split_variance)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_get_feature(sample,
tiles,
m_offset, v_offset,
x, y,
mean, variance,
prefilter_rect,
buffer_pass_stride,
buffer_denoising_offset,
use_split_variance);
}
}
__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
ccl_global float *variance,
ccl_global float *a,
ccl_global float *b,
int4 prefilter_rect,
int r)
{
int x = prefilter_rect.x + get_global_id(0);
int y = prefilter_rect.y + get_global_id(1);
if(x < prefilter_rect.z && y < prefilter_rect.w) {
kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
}
}
__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
ccl_global float *transform,
ccl_global int *rank,
int4 filter_area,
int4 rect,
int pass_stride,
int radius,
float pca_threshold)
{
int x = get_global_id(0);
int y = get_global_id(1);
if(x < filter_area.z && y < filter_area.w) {
ccl_global int *l_rank = rank + y*filter_area.z + x;
ccl_global float *l_transform = transform + y*filter_area.z + x;
kernel_filter_construct_transform(buffer,
x + filter_area.x, y + filter_area.y,
rect, pass_stride,
l_transform, l_rank,
radius, pca_threshold,
filter_area.z*filter_area.w,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
int dy,
ccl_global float ccl_restrict_ptr weightImage,
ccl_global float ccl_restrict_ptr varianceImage,
ccl_global float *differenceImage,
int4 rect,
int w,
int channel_offset,
float a,
float k_2) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
}
}
__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float *outImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr image,
ccl_global float *outImage,
ccl_global float *accumImage,
int4 rect,
int w,
int f) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
}
}
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
ccl_global float ccl_restrict_ptr accumImage,
int4 rect,
int w) {
int x = get_global_id(0) + rect.x;
int y = get_global_id(1) + rect.y;
if(x < rect.z && y < rect.w) {
kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
}
}
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
int dy,
ccl_global float ccl_restrict_ptr differenceImage,
ccl_global float ccl_restrict_ptr buffer,
ccl_global float *color_pass,
ccl_global float *variance_pass,
ccl_global float ccl_restrict_ptr transform,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 rect,
int4 filter_rect,
int w,
int h,
int f,
int pass_stride) {
int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
kernel_filter_nlm_construct_gramian(x, y,
dx, dy,
differenceImage,
buffer,
color_pass, variance_pass,
transform, rank,
XtWX, XtWY,
rect, filter_rect,
w, h, f,
pass_stride,
get_local_id(1)*get_local_size(0) + get_local_id(0));
}
}
__kernel void kernel_ocl_filter_finalize(int w,
int h,
ccl_global float *buffer,
ccl_global int *rank,
ccl_global float *XtWX,
ccl_global float3 *XtWY,
int4 filter_area,
int4 buffer_params,
int sample) {
int x = get_global_id(0);
int y = get_global_id(1);
if(x < filter_area.z && y < filter_area.w) {
int storage_ofs = y*filter_area.z+x;
rank += storage_ofs;
XtWX += storage_ofs;
XtWY += storage_ofs;
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
}
}
__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
ccl_global float *buffer_1,
ccl_global float *buffer_2,
ccl_global float *buffer_3,
ccl_global float *buffer_4,
ccl_global float *buffer_5,
ccl_global float *buffer_6,
ccl_global float *buffer_7,
ccl_global float *buffer_8,
ccl_global float *buffer_9)
{
if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
tiles->buffers[0] = buffer_1;
tiles->buffers[1] = buffer_2;
tiles->buffers[2] = buffer_3;
tiles->buffers[3] = buffer_4;
tiles->buffers[4] = buffer_5;
tiles->buffers[5] = buffer_6;
tiles->buffers[6] = buffer_7;
tiles->buffers[7] = buffer_8;
tiles->buffers[8] = buffer_9;
}
}

@@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG rng = kernel_split_state.rng[ray_index];
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
float3 throughput = branched_state->throughput;
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
float sum_sample_weight = 0.0f;
#ifdef __DENOISING_FEATURES__
if(ps->denoising_feature_weight > 0.0f) {
for(int i = 0; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
/* transparency is not handled here, but in outer loop */
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
continue;
}
sum_sample_weight += sc->sample_weight;
}
}
else {
sum_sample_weight = 1.0f;
}
#endif /* __DENOISING_FEATURES__ */
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
const ShaderClosure *sc = &sd->closure[i];
@@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
RNG bsdf_rng = cmj_hash(rng, i);
for(int j = branched_state->next_sample; j < num_samples; j++) {
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
if(reset_path_state) {
*ps = branched_state->path_state;
}
@@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
tp,
ps,
L,
bsdf_ray))
bsdf_ray,
sum_sample_weight))
{
continue;
}

@@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
float3 L_sum;
#ifdef __SHADOW_TRICKS__
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
}
else
#endif /* __SHADOW_TRICKS__ */
{
L_sum = path_radiance_clamp_and_sum(kg, L);
}
kernel_write_light_passes(kg, buffer, L, sample);
#ifdef __KERNEL_DEBUG__
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
#endif
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
/* accumulate result in output buffer */
kernel_write_pass_float4(buffer, sample, L_rad);
bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
path_rng_end(kg, rng_state, rng);
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);

@@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
#ifdef __SHADOW_TRICKS__
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
if(state->flag & PATH_RAY_CAMERA) {
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
state->catcher_object = sd->object;
if(!kernel_data.background.transparent) {
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
@@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
kernel_split_state.throughput[ray_index] = throughput/probability;
}
}
kernel_update_denoising_features(kg, sd, state, L);
}
}

@@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
&shadow))
{
/* accumulate */
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
}
else {
path_radiance_accum_total_light(L, throughput, &L_light);
path_radiance_accum_total_light(L, state, throughput, &L_light);
}
}

@@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
bsdf->N = N;
sd->flag |= bsdf_transparent_setup(bsdf);
}
break;
@@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
if(bsdf) {
bsdf->N = N;
/* todo: giving a fixed weight here will cause issues when
* mixing multiple BSDFS. energy will not be conserved and
* the throughput can blow up after multiple bounces. we

@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
strength = max(strength, 0.0f);
/* compute and output perturbed normal */
float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
if(is_zero(normal_out)) {
normal_out = normal_in;
}
else {
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
}
if(use_object_space) {
object_normal_transform(kg, sd, &normal_out);

@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
#ifdef __UV__
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
#endif
default: data = make_float3(0.0f, 0.0f, 0.0f);
}
stack_store_float3(stack, out_offset, data);

@@ -317,8 +317,8 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
float3 co = stack_load_float3(stack, co_offset);
float2 uv;
co = normalize(co);
co = safe_normalize(co);
if(projection == 0)
uv = direction_to_equirectangular(co);
else

@@ -402,7 +402,6 @@ typedef enum ClosureType {
CLOSURE_BSDF_DIFFUSE_TOON_ID,
/* Glossy */
CLOSURE_BSDF_GLOSSY_ID,
CLOSURE_BSDF_REFLECTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_ID,
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
@@ -423,14 +422,13 @@ typedef enum ClosureType {
CLOSURE_BSDF_HAIR_REFLECTION_ID,
/* Transmission */
CLOSURE_BSDF_TRANSMISSION_ID,
CLOSURE_BSDF_TRANSLUCENT_ID,
CLOSURE_BSDF_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
CLOSURE_BSDF_SHARP_GLASS_ID,
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
@@ -465,13 +463,16 @@ typedef enum ClosureType {
/* watch this, being lazy with memory usage */
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
(type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
@@ -480,7 +481,7 @@ typedef enum ClosureType {
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
#define CLOSURE_WEIGHT_CUTOFF 1e-5f

@@ -42,6 +42,9 @@ BufferParams::BufferParams()
full_width = 0;
full_height = 0;
denoising_data_pass = false;
denoising_clean_pass = false;
Pass::add(PASS_COMBINED, passes);
}
@@ -68,10 +71,25 @@ int BufferParams::get_passes_size()
for(size_t i = 0; i < passes.size(); i++)
size += passes[i].components;
if(denoising_data_pass) {
size += DENOISING_PASS_SIZE_BASE;
if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
}
return align_up(size, 4);
}
int BufferParams::get_denoising_offset()
{
int offset = 0;
for(size_t i = 0; i < passes.size(); i++)
offset += passes[i].components;
return offset;
}
/* Render Buffer Task */
RenderTile::RenderTile()
@@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
}
bool RenderBuffers::copy_from_device()
bool RenderBuffers::copy_from_device(Device *from_device)
{
if(!buffer.device_pointer)
return false;
device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
if(!from_device) {
from_device = device;
}
from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
return true;
}
bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
{
float scale = 1.0f/sample;
if(offset == DENOISING_PASS_COLOR) {
scale *= exposure;
}
else if(offset == DENOISING_PASS_COLOR_VAR) {
scale *= exposure*exposure;
}
offset += params.get_denoising_offset();
float *in = (float*)buffer.data_pointer + offset;
int pass_stride = params.get_passes_size();
int size = params.width*params.height;
if(components == 1) {
for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
pixels[0] = in[0]*scale;
}
}
else if(components == 3) {
for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
pixels[0] = in[0]*scale;
pixels[1] = in[1]*scale;
pixels[2] = in[2]*scale;
}
}
else {
return false;
}
return true;
}

@@ -51,6 +51,9 @@ public:
/* passes */
array<Pass> passes;
bool denoising_data_pass;
/* If only some light path types should be denoised, an additional pass is needed. */
bool denoising_clean_pass;
/* functions */
BufferParams();
@@ -59,6 +62,7 @@ public:
bool modified(const BufferParams& params);
void add_pass(PassType type);
int get_passes_size();
int get_denoising_offset();
};
/* Render Buffers */
@@ -73,18 +77,19 @@ public:
/* random number generator state */
device_vector<uint> rng_state;
Device *device;
explicit RenderBuffers(Device *device);
~RenderBuffers();
void reset(Device *device, BufferParams& params);
bool copy_from_device();
bool copy_from_device(Device *from_device = NULL);
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
protected:
void device_free();
Device *device;
};
/* Display Buffer
@@ -131,6 +136,9 @@ protected:
class RenderTile {
public:
typedef enum { PATH_TRACE, DENOISE } Task;
Task task;
int x, y, w, h;
int start_sample;
int num_samples;
@@ -138,6 +146,7 @@ public:
int resolution;
int offset;
int stride;
int tile_index;
device_ptr buffer;
device_ptr rng_state;

@@ -279,6 +279,10 @@ NODE_DEFINE(Film)
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
return type;
}
@@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->pass_stride += pass.components;
}
kfilm->pass_denoising_data = 0;
kfilm->pass_denoising_clean = 0;
kfilm->denoising_flags = 0;
if(denoising_data_pass) {
kfilm->pass_denoising_data = kfilm->pass_stride;
kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
kfilm->denoising_flags = denoising_flags;
if(denoising_clean_pass) {
kfilm->pass_denoising_clean = kfilm->pass_stride;
kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
kfilm->use_light_pass = 1;
}
}
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
kfilm->pass_alpha_threshold = pass_alpha_threshold;
@@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
kfilm->mist_falloff = mist_falloff;
pass_stride = kfilm->pass_stride;
denoising_data_offset = kfilm->pass_denoising_data;
denoising_clean_offset = kfilm->pass_denoising_clean;
need_update = false;
}

Some files were not shown because too many files have changed in this diff.