Cycles: Implement denoising option for reducing noise in the rendered image
This commit contains the first part of the new Cycles denoising option, which filters the resulting image using information gathered during rendering to get rid of noise while preserving visual features as well as possible. To use the option, enable it in the render layer options. The default settings fit a wide range of scenes, but the user can tweak individual settings to control the tradeoff between a noise-free image, image details, and calculation time. Note that the denoiser may still change in the future and that some features are not implemented yet. The most important missing feature is animation denoising, which uses information from multiple frames at once to produce a flicker-free and smoother result. These features will be added in the future. Finally, thanks to all the people who supported this project: - Google (through the GSoC) and Theory Studios for sponsoring the development - The authors of the papers I used for implementing the denoiser (more details on them will be included in the technical docs) - The other Cycles devs for feedback on the code, especially Sergey for mentoring the GSoC project and Brecht for the code review! - And of course the users who helped with testing, reported bugs and things that could and/or should work better!
This commit is contained in:
@@ -238,3 +238,15 @@ def register_passes(engine, scene, srl):
|
|||||||
if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
|
if crl.pass_debug_bvh_traversed_instances: engine.register_pass(scene, srl, "Debug BVH Traversed Instances", 1, "X", 'VALUE')
|
||||||
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
|
if crl.pass_debug_bvh_intersections: engine.register_pass(scene, srl, "Debug BVH Intersections", 1, "X", 'VALUE')
|
||||||
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
|
if crl.pass_debug_ray_bounces: engine.register_pass(scene, srl, "Debug Ray Bounces", 1, "X", 'VALUE')
|
||||||
|
|
||||||
|
# Register the denoising data passes only when denoising is enabled for the
# render layer and the layer is set to store those passes.
if crl.use_denoising and crl.denoising_store_passes:
    engine.register_pass(scene, srl, "Denoising Normal", 3, "XYZ", 'VECTOR')
    engine.register_pass(scene, srl, "Denoising Normal Variance", 3, "XYZ", 'VECTOR')
    engine.register_pass(scene, srl, "Denoising Albedo", 3, "RGB", 'COLOR')
    engine.register_pass(scene, srl, "Denoising Albedo Variance", 3, "RGB", 'COLOR')
    engine.register_pass(scene, srl, "Denoising Depth", 1, "Z", 'VALUE')
    engine.register_pass(scene, srl, "Denoising Depth Variance", 1, "Z", 'VALUE')
    engine.register_pass(scene, srl, "Denoising Shadow A", 3, "XYV", 'VECTOR')
    engine.register_pass(scene, srl, "Denoising Shadow B", 3, "XYV", 'VECTOR')
    engine.register_pass(scene, srl, "Denoising Image", 3, "RGB", 'COLOR')
    engine.register_pass(scene, srl, "Denoising Image Variance", 3, "RGB", 'COLOR')
|
@@ -1195,6 +1195,80 @@ class CyclesRenderLayerSettings(bpy.types.PropertyGroup):
|
|||||||
default=False,
|
default=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cls.use_denoising = BoolProperty(
|
||||||
|
name="Use Denoising",
|
||||||
|
description="Denoise the rendered image",
|
||||||
|
default=False,
|
||||||
|
)
|
||||||
|
cls.denoising_diffuse_direct = BoolProperty(
|
||||||
|
name="Diffuse Direct",
|
||||||
|
description="Denoise the direct diffuse lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_diffuse_indirect = BoolProperty(
|
||||||
|
name="Diffuse Indirect",
|
||||||
|
description="Denoise the indirect diffuse lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_glossy_direct = BoolProperty(
|
||||||
|
name="Glossy Direct",
|
||||||
|
description="Denoise the direct glossy lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_glossy_indirect = BoolProperty(
|
||||||
|
name="Glossy Indirect",
|
||||||
|
description="Denoise the indirect glossy lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_transmission_direct = BoolProperty(
|
||||||
|
name="Transmission Direct",
|
||||||
|
description="Denoise the direct transmission lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_transmission_indirect = BoolProperty(
|
||||||
|
name="Transmission Indirect",
|
||||||
|
description="Denoise the indirect transmission lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_subsurface_direct = BoolProperty(
|
||||||
|
name="Subsurface Direct",
|
||||||
|
description="Denoise the direct subsurface lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_subsurface_indirect = BoolProperty(
|
||||||
|
name="Subsurface Indirect",
|
||||||
|
description="Denoise the indirect subsurface lighting",
|
||||||
|
default=True,
|
||||||
|
)
|
||||||
|
cls.denoising_strength = FloatProperty(
|
||||||
|
name="Denoising Strength",
|
||||||
|
description="Controls neighbor pixel weighting for the denoising filter (lower values preserve more detail, but aren't as smooth)",
|
||||||
|
min=0.0, max=1.0,
|
||||||
|
default=0.5,
|
||||||
|
)
|
||||||
|
cls.denoising_feature_strength = FloatProperty(
|
||||||
|
name="Denoising Feature Strength",
|
||||||
|
description="Controls removal of noisy image feature passes (lower values preserve more detail, but aren't as smooth)",
|
||||||
|
min=0.0, max=1.0,
|
||||||
|
default=0.5,
|
||||||
|
)
|
||||||
|
cls.denoising_radius = IntProperty(
|
||||||
|
name="Denoising Radius",
|
||||||
|
description="Size of the image area that's used to denoise a pixel (higher values are smoother, but might lose detail and are slower)",
|
||||||
|
min=1, max=50,
|
||||||
|
default=8,
|
||||||
|
)
|
||||||
|
# Boolean render-layer option selecting a relative instead of an absolute
# threshold in the denoising filter; off by default.
cls.denoising_relative_pca = BoolProperty(
        name="Relative filter",
        description="When removing features that don't carry information, use a relative threshold instead of an absolute one (can help to reduce artifacts, but might cause detail loss around edges)",
        default=False,
        )
|
||||||
|
cls.denoising_store_passes = BoolProperty(
|
||||||
|
name="Store denoising passes",
|
||||||
|
description="Store the denoising feature passes and the noisy image",
|
||||||
|
default=False,
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def unregister(cls):
|
def unregister(cls):
|
||||||
del bpy.types.SceneRenderLayer.cycles
|
del bpy.types.SceneRenderLayer.cycles
|
||||||
|
@@ -530,6 +530,12 @@ class CyclesRender_PT_layer_passes(CyclesButtonsPanel, Panel):
|
|||||||
col.prop(rl, "use_pass_emit", text="Emission")
|
col.prop(rl, "use_pass_emit", text="Emission")
|
||||||
col.prop(rl, "use_pass_environment")
|
col.prop(rl, "use_pass_environment")
|
||||||
|
|
||||||
|
if context.scene.cycles.feature_set == 'EXPERIMENTAL':
|
||||||
|
col.separator()
|
||||||
|
sub = col.column()
|
||||||
|
sub.active = crl.use_denoising
|
||||||
|
sub.prop(crl, "denoising_store_passes", text="Denoising")
|
||||||
|
|
||||||
if _cycles.with_cycles_debug:
|
if _cycles.with_cycles_debug:
|
||||||
col = layout.column()
|
col = layout.column()
|
||||||
col.prop(crl, "pass_debug_bvh_traversed_nodes")
|
col.prop(crl, "pass_debug_bvh_traversed_nodes")
|
||||||
@@ -581,6 +587,71 @@ class CyclesRender_PT_views(CyclesButtonsPanel, Panel):
|
|||||||
row.prop(rv, "camera_suffix", text="")
|
row.prop(rv, "camera_suffix", text="")
|
||||||
|
|
||||||
|
|
||||||
|
class CyclesRender_PT_denoising(CyclesButtonsPanel, Panel):
    """Render-layer panel exposing the per-layer Cycles denoising settings."""
    bl_label = "Denoising"
    bl_context = "render_layer"
    bl_options = {'DEFAULT_CLOSED'}

    def draw_header(self, context):
        """Draw the `use_denoising` checkbox in the panel header."""
        scene = context.scene
        crl = scene.render.layers.active.cycles
        layout = self.layout

        # The header is marked inactive while progressive refine is enabled.
        layout.active = not scene.cycles.use_progressive_refine
        layout.prop(crl, "use_denoising", text="")

    def draw(self, context):
        """Draw the filter settings followed by one toggle row per light path type."""
        layout = self.layout

        scene = context.scene
        crl = scene.render.layers.active.cycles

        # Settings are only active when denoising is on and progressive refine is off.
        layout.active = crl.use_denoising and not scene.cycles.use_progressive_refine

        split = layout.split()

        left = split.column().column(align=True)
        left.prop(crl, "denoising_radius", text="Radius")
        left.prop(crl, "denoising_strength", slider=True, text="Strength")

        right = split.column().column(align=True)
        right.prop(crl, "denoising_feature_strength", slider=True, text="Feature Strength")
        right.prop(crl, "denoising_relative_pca")

        layout.separator()

        # (row label, direct property, indirect property)
        component_rows = (
            ("Diffuse:", "denoising_diffuse_direct", "denoising_diffuse_indirect"),
            ("Glossy:", "denoising_glossy_direct", "denoising_glossy_indirect"),
            ("Transmission:", "denoising_transmission_direct", "denoising_transmission_indirect"),
            ("Subsurface:", "denoising_subsurface_direct", "denoising_subsurface_indirect"),
        )
        for label, direct_prop, indirect_prop in component_rows:
            row = layout.row()
            row.label(text=label)
            toggles = row.row(align=True)
            toggles.prop(crl, direct_prop, text="Direct", toggle=True)
            toggles.prop(crl, indirect_prop, text="Indirect", toggle=True)
|
||||||
|
|
||||||
|
|
||||||
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
|
class Cycles_PT_post_processing(CyclesButtonsPanel, Panel):
|
||||||
bl_label = "Post Processing"
|
bl_label = "Post Processing"
|
||||||
bl_options = {'DEFAULT_CLOSED'}
|
bl_options = {'DEFAULT_CLOSED'}
|
||||||
@@ -1734,6 +1805,7 @@ classes = (
|
|||||||
CyclesRender_PT_layer_options,
|
CyclesRender_PT_layer_options,
|
||||||
CyclesRender_PT_layer_passes,
|
CyclesRender_PT_layer_passes,
|
||||||
CyclesRender_PT_views,
|
CyclesRender_PT_views,
|
||||||
|
CyclesRender_PT_denoising,
|
||||||
Cycles_PT_post_processing,
|
Cycles_PT_post_processing,
|
||||||
CyclesCamera_PT_dof,
|
CyclesCamera_PT_dof,
|
||||||
Cycles_PT_context_material,
|
Cycles_PT_context_material,
|
||||||
|
@@ -299,12 +299,13 @@ static BL::RenderResult begin_render_result(BL::RenderEngine& b_engine,
|
|||||||
static void end_render_result(BL::RenderEngine& b_engine,
|
static void end_render_result(BL::RenderEngine& b_engine,
|
||||||
BL::RenderResult& b_rr,
|
BL::RenderResult& b_rr,
|
||||||
bool cancel,
|
bool cancel,
|
||||||
|
bool highlight,
|
||||||
bool do_merge_results)
|
bool do_merge_results)
|
||||||
{
|
{
|
||||||
b_engine.end_result(b_rr, (int)cancel, (int)do_merge_results);
|
b_engine.end_result(b_rr, (int)cancel, (int) highlight, (int)do_merge_results);
|
||||||
}
|
}
|
||||||
|
|
||||||
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only)
|
void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight)
|
||||||
{
|
{
|
||||||
BufferParams& params = rtile.buffers->params;
|
BufferParams& params = rtile.buffers->params;
|
||||||
int x = params.full_x - session->tile_manager.params.full_x;
|
int x = params.full_x - session->tile_manager.params.full_x;
|
||||||
@@ -340,37 +341,37 @@ void BlenderSession::do_write_update_render_tile(RenderTile& rtile, bool do_upda
|
|||||||
update_render_result(b_rr, b_rlay, rtile);
|
update_render_result(b_rr, b_rlay, rtile);
|
||||||
}
|
}
|
||||||
|
|
||||||
end_render_result(b_engine, b_rr, true, true);
|
end_render_result(b_engine, b_rr, true, highlight, true);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* write result */
|
/* write result */
|
||||||
write_render_result(b_rr, b_rlay, rtile);
|
write_render_result(b_rr, b_rlay, rtile);
|
||||||
end_render_result(b_engine, b_rr, false, true);
|
end_render_result(b_engine, b_rr, false, false, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void BlenderSession::write_render_tile(RenderTile& rtile)
|
void BlenderSession::write_render_tile(RenderTile& rtile)
|
||||||
{
|
{
|
||||||
do_write_update_render_tile(rtile, false);
|
do_write_update_render_tile(rtile, false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void BlenderSession::update_render_tile(RenderTile& rtile)
|
void BlenderSession::update_render_tile(RenderTile& rtile, bool highlight)
|
||||||
{
|
{
|
||||||
/* use final write for preview renders, otherwise render result wouldn't be
|
/* use final write for preview renders, otherwise render result wouldn't be
|
||||||
* be updated in blender side
|
* be updated in blender side
|
||||||
* would need to be investigated a bit further, but for now shall be fine
|
* would need to be investigated a bit further, but for now shall be fine
|
||||||
*/
|
*/
|
||||||
if(!b_engine.is_preview())
|
if(!b_engine.is_preview())
|
||||||
do_write_update_render_tile(rtile, true);
|
do_write_update_render_tile(rtile, true, highlight);
|
||||||
else
|
else
|
||||||
do_write_update_render_tile(rtile, false);
|
do_write_update_render_tile(rtile, false, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void BlenderSession::render()
|
void BlenderSession::render()
|
||||||
{
|
{
|
||||||
/* set callback to write out render results */
|
/* set callback to write out render results */
|
||||||
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
|
session->write_render_tile_cb = function_bind(&BlenderSession::write_render_tile, this, _1);
|
||||||
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1);
|
session->update_render_tile_cb = function_bind(&BlenderSession::update_render_tile, this, _1, _2);
|
||||||
|
|
||||||
/* get buffer parameters */
|
/* get buffer parameters */
|
||||||
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
|
SessionParams session_params = BlenderSync::get_session_params(b_engine, b_userpref, b_scene, background);
|
||||||
@@ -391,7 +392,7 @@ void BlenderSession::render()
|
|||||||
|
|
||||||
/* layer will be missing if it was disabled in the UI */
|
/* layer will be missing if it was disabled in the UI */
|
||||||
if(b_single_rlay == b_rr.layers.end()) {
|
if(b_single_rlay == b_rr.layers.end()) {
|
||||||
end_render_result(b_engine, b_rr, true, false);
|
end_render_result(b_engine, b_rr, true, true, false);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -407,6 +408,29 @@ void BlenderSession::render()
|
|||||||
}
|
}
|
||||||
|
|
||||||
buffer_params.passes = passes;
|
buffer_params.passes = passes;
|
||||||
|
|
||||||
|
PointerRNA crl = RNA_pointer_get(&b_layer_iter->ptr, "cycles");
|
||||||
|
bool use_denoising = !session_params.progressive_refine && get_boolean(crl, "use_denoising");
|
||||||
|
buffer_params.denoising_data_pass = use_denoising;
|
||||||
|
session->tile_manager.schedule_denoising = use_denoising;
|
||||||
|
session->params.use_denoising = use_denoising;
|
||||||
|
scene->film->denoising_data_pass = buffer_params.denoising_data_pass;
|
||||||
|
scene->film->denoising_flags = 0;
|
||||||
|
if(!get_boolean(crl, "denoising_diffuse_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_DIR;
|
||||||
|
if(!get_boolean(crl, "denoising_diffuse_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_DIFFUSE_IND;
|
||||||
|
if(!get_boolean(crl, "denoising_glossy_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_DIR;
|
||||||
|
if(!get_boolean(crl, "denoising_glossy_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_GLOSSY_IND;
|
||||||
|
if(!get_boolean(crl, "denoising_transmission_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_DIR;
|
||||||
|
if(!get_boolean(crl, "denoising_transmission_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_TRANSMISSION_IND;
|
||||||
|
if(!get_boolean(crl, "denoising_subsurface_direct")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_DIR;
|
||||||
|
if(!get_boolean(crl, "denoising_subsurface_indirect")) scene->film->denoising_flags |= DENOISING_CLEAN_SUBSURFACE_IND;
|
||||||
|
scene->film->denoising_clean_pass = (scene->film->denoising_flags & DENOISING_CLEAN_ALL_PASSES);
|
||||||
|
buffer_params.denoising_clean_pass = scene->film->denoising_clean_pass;
|
||||||
|
session->params.denoising_radius = get_int(crl, "denoising_radius");
|
||||||
|
session->params.denoising_strength = get_float(crl, "denoising_strength");
|
||||||
|
session->params.denoising_feature_strength = get_float(crl, "denoising_feature_strength");
|
||||||
|
session->params.denoising_relative_pca = get_boolean(crl, "denoising_relative_pca");
|
||||||
|
|
||||||
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
|
scene->film->pass_alpha_threshold = b_layer_iter->pass_alpha_threshold();
|
||||||
scene->film->tag_passes_update(scene, passes);
|
scene->film->tag_passes_update(scene, passes);
|
||||||
scene->film->tag_update(scene);
|
scene->film->tag_update(scene);
|
||||||
@@ -460,7 +484,7 @@ void BlenderSession::render()
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* free result without merging */
|
/* free result without merging */
|
||||||
end_render_result(b_engine, b_rr, true, false);
|
end_render_result(b_engine, b_rr, true, true, false);
|
||||||
|
|
||||||
if(session->progress.get_cancel())
|
if(session->progress.get_cancel())
|
||||||
break;
|
break;
|
||||||
@@ -666,6 +690,12 @@ void BlenderSession::do_write_update_render_result(BL::RenderResult& b_rr,
|
|||||||
/* copy pixels */
|
/* copy pixels */
|
||||||
read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
|
read = buffers->get_pass_rect(pass_type, exposure, sample, components, &pixels[0]);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
int denoising_offset = BlenderSync::get_denoising_pass(b_pass);
|
||||||
|
if(denoising_offset >= 0) {
|
||||||
|
read = buffers->get_denoising_pass_rect(denoising_offset, exposure, sample, components, &pixels[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(!read) {
|
if(!read) {
|
||||||
memset(&pixels[0], 0, pixels.size()*sizeof(float));
|
memset(&pixels[0], 0, pixels.size()*sizeof(float));
|
||||||
|
@@ -79,7 +79,7 @@ public:
|
|||||||
void update_render_result(BL::RenderResult& b_rr,
|
void update_render_result(BL::RenderResult& b_rr,
|
||||||
BL::RenderLayer& b_rlay,
|
BL::RenderLayer& b_rlay,
|
||||||
RenderTile& rtile);
|
RenderTile& rtile);
|
||||||
void update_render_tile(RenderTile& rtile);
|
void update_render_tile(RenderTile& rtile, bool highlight);
|
||||||
|
|
||||||
/* interactive updates */
|
/* interactive updates */
|
||||||
void synchronize();
|
void synchronize();
|
||||||
@@ -147,7 +147,7 @@ protected:
|
|||||||
BL::RenderLayer& b_rlay,
|
BL::RenderLayer& b_rlay,
|
||||||
RenderTile& rtile,
|
RenderTile& rtile,
|
||||||
bool do_update_only);
|
bool do_update_only);
|
||||||
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only);
|
void do_write_update_render_tile(RenderTile& rtile, bool do_update_only, bool highlight);
|
||||||
|
|
||||||
int builtin_image_frame(const string &builtin_name);
|
int builtin_image_frame(const string &builtin_name);
|
||||||
void builtin_image_info(const string &builtin_name,
|
void builtin_image_info(const string &builtin_name,
|
||||||
|
@@ -525,6 +525,30 @@ PassType BlenderSync::get_pass_type(BL::RenderPass& b_pass)
|
|||||||
return PASS_NONE;
|
return PASS_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Map a render pass named "Denoising <suffix>" to its DENOISING_PASS_* offset.
 *
 * Returns -1 when the pass name does not start with "Denoising " or when the
 * remainder of the name is not one of the suffixes listed in the table below. */
int BlenderSync::get_denoising_pass(BL::RenderPass& b_pass)
{
	static const char denoising_prefix[] = "Denoising ";
	const size_t prefix_len = sizeof(denoising_prefix) - 1;

	const string full_name = b_pass.name();
	if(full_name.compare(0, prefix_len, denoising_prefix) != 0) {
		return -1;
	}
	const string suffix = full_name.substr(prefix_len);

	static const struct {
		const char *suffix;
		int offset;
	} pass_table[] = {
		{"Normal",          DENOISING_PASS_NORMAL},
		{"Normal Variance", DENOISING_PASS_NORMAL_VAR},
		{"Albedo",          DENOISING_PASS_ALBEDO},
		{"Albedo Variance", DENOISING_PASS_ALBEDO_VAR},
		{"Depth",           DENOISING_PASS_DEPTH},
		{"Depth Variance",  DENOISING_PASS_DEPTH_VAR},
		{"Shadow A",        DENOISING_PASS_SHADOW_A},
		{"Shadow B",        DENOISING_PASS_SHADOW_B},
		{"Image",           DENOISING_PASS_COLOR},
		{"Image Variance",  DENOISING_PASS_COLOR_VAR},
	};

	const size_t num_entries = sizeof(pass_table) / sizeof(pass_table[0]);
	for(size_t i = 0; i < num_entries; i++) {
		if(suffix == pass_table[i].suffix) {
			return pass_table[i].offset;
		}
	}

	return -1;
}
|
||||||
|
|
||||||
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
|
array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
|
||||||
BL::SceneRenderLayer& b_srlay)
|
BL::SceneRenderLayer& b_srlay)
|
||||||
{
|
{
|
||||||
@@ -544,8 +568,20 @@ array<Pass> BlenderSync::sync_render_passes(BL::RenderLayer& b_rlay,
|
|||||||
Pass::add(pass_type, passes);
|
Pass::add(pass_type, passes);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __KERNEL_DEBUG__
|
|
||||||
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
|
PointerRNA crp = RNA_pointer_get(&b_srlay.ptr, "cycles");
|
||||||
|
/* Add the denoising data passes only when denoising itself is enabled for the
 * layer. Checking "denoising_store_passes" alone would add passes that the
 * add-on's register_passes() does not register, since that side requires both
 * options to be set. */
if(get_boolean(crp, "use_denoising") &&
   get_boolean(crp, "denoising_store_passes"))
{
	const string layer_name = b_srlay.name();
	b_engine.add_pass("Denoising Normal", 3, "XYZ", layer_name.c_str());
	b_engine.add_pass("Denoising Normal Variance", 3, "XYZ", layer_name.c_str());
	b_engine.add_pass("Denoising Albedo", 3, "RGB", layer_name.c_str());
	b_engine.add_pass("Denoising Albedo Variance", 3, "RGB", layer_name.c_str());
	b_engine.add_pass("Denoising Depth", 1, "Z", layer_name.c_str());
	b_engine.add_pass("Denoising Depth Variance", 1, "Z", layer_name.c_str());
	b_engine.add_pass("Denoising Shadow A", 3, "XYV", layer_name.c_str());
	b_engine.add_pass("Denoising Shadow B", 3, "XYV", layer_name.c_str());
	b_engine.add_pass("Denoising Image", 3, "RGB", layer_name.c_str());
	b_engine.add_pass("Denoising Image Variance", 3, "RGB", layer_name.c_str());
}
|
||||||
|
#ifdef __KERNEL_DEBUG__
|
||||||
if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
|
if(get_boolean(crp, "pass_debug_bvh_traversed_nodes")) {
|
||||||
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
|
b_engine.add_pass("Debug BVH Traversed Nodes", 1, "X", b_srlay.name().c_str());
|
||||||
Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
|
Pass::add(PASS_BVH_TRAVERSED_NODES, passes);
|
||||||
|
@@ -96,6 +96,7 @@ public:
|
|||||||
int width, int height);
|
int width, int height);
|
||||||
|
|
||||||
static PassType get_pass_type(BL::RenderPass& b_pass);
|
static PassType get_pass_type(BL::RenderPass& b_pass);
|
||||||
|
static int get_denoising_pass(BL::RenderPass& b_pass);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/* sync */
|
/* sync */
|
||||||
|
@@ -25,6 +25,7 @@ set(SRC
|
|||||||
device.cpp
|
device.cpp
|
||||||
device_cpu.cpp
|
device_cpu.cpp
|
||||||
device_cuda.cpp
|
device_cuda.cpp
|
||||||
|
device_denoising.cpp
|
||||||
device_multi.cpp
|
device_multi.cpp
|
||||||
device_opencl.cpp
|
device_opencl.cpp
|
||||||
device_split_kernel.cpp
|
device_split_kernel.cpp
|
||||||
@@ -48,6 +49,7 @@ endif()
|
|||||||
|
|
||||||
set(SRC_HEADERS
|
set(SRC_HEADERS
|
||||||
device.h
|
device.h
|
||||||
|
device_denoising.h
|
||||||
device_memory.h
|
device_memory.h
|
||||||
device_intern.h
|
device_intern.h
|
||||||
device_network.h
|
device_network.h
|
||||||
|
@@ -402,4 +402,16 @@ void Device::free_memory()
|
|||||||
devices.free_memory();
|
devices.free_memory();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Ask `device` for a sub-pointer into `mem` (described by offset, size and
 * memory type) and keep it in `ptr` for the lifetime of this object. */
device_sub_ptr::device_sub_ptr(Device *device,
                               device_memory& mem,
                               int offset,
                               int size,
                               MemoryType type)
 : device(device)
{
	this->ptr = this->device->mem_alloc_sub_ptr(mem, offset, size, type);
}

/* Hand the stored sub-pointer back to the device it was obtained from. */
device_sub_ptr::~device_sub_ptr()
{
	this->device->mem_free_sub_ptr(this->ptr);
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
@@ -228,6 +228,7 @@ struct DeviceDrawParams {
|
|||||||
};
|
};
|
||||||
|
|
||||||
class Device {
|
class Device {
|
||||||
|
friend class device_sub_ptr;
|
||||||
protected:
|
protected:
|
||||||
Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
|
Device(DeviceInfo& info_, Stats &stats_, bool background) : background(background), vertex_buffer(0), info(info_), stats(stats_) {}
|
||||||
|
|
||||||
@@ -237,6 +238,14 @@ protected:
|
|||||||
/* used for real time display */
|
/* used for real time display */
|
||||||
unsigned int vertex_buffer;
|
unsigned int vertex_buffer;
|
||||||
|
|
||||||
|
/* Sub-pointer allocation hook.
 * Only required for devices that implement denoising, so the base
 * implementation asserts and returns a zero device_ptr. */
virtual device_ptr mem_alloc_sub_ptr(device_memory& /*mem*/,
                                     int /*offset*/,
                                     int /*size*/,
                                     MemoryType /*type*/)
{
	assert(false);
	return static_cast<device_ptr>(0);
}
|
||||||
|
/* Sub-pointer release hook; the base implementation does nothing. */
virtual void mem_free_sub_ptr(device_ptr /*ptr*/)
{
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
virtual ~Device();
|
virtual ~Device();
|
||||||
|
|
||||||
@@ -265,6 +274,8 @@ public:
|
|||||||
virtual void mem_zero(device_memory& mem) = 0;
|
virtual void mem_zero(device_memory& mem) = 0;
|
||||||
virtual void mem_free(device_memory& mem) = 0;
|
virtual void mem_free(device_memory& mem) = 0;
|
||||||
|
|
||||||
|
virtual int mem_address_alignment() { return 16; }
|
||||||
|
|
||||||
/* constant memory */
|
/* constant memory */
|
||||||
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
|
virtual void const_copy_to(const char *name, void *host, size_t size) = 0;
|
||||||
|
|
||||||
@@ -312,6 +323,8 @@ public:
|
|||||||
/* multi device */
|
/* multi device */
|
||||||
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
|
virtual void map_tile(Device * /*sub_device*/, RenderTile& /*tile*/) {}
|
||||||
virtual int device_number(Device * /*sub_device*/) { return 0; }
|
virtual int device_number(Device * /*sub_device*/) { return 0; }
|
||||||
|
virtual void map_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
|
||||||
|
virtual void unmap_neighbor_tiles(Device * /*sub_device*/, RenderTile * /*tiles*/) {}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
|
static Device *create(DeviceInfo& info, Stats &stats, bool background = true);
|
||||||
|
@@ -25,6 +25,7 @@
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "device/device.h"
|
#include "device/device.h"
|
||||||
|
#include "device/device_denoising.h"
|
||||||
#include "device/device_intern.h"
|
#include "device/device_intern.h"
|
||||||
#include "device/device_split_kernel.h"
|
#include "device/device_split_kernel.h"
|
||||||
|
|
||||||
@@ -34,6 +35,8 @@
|
|||||||
#include "kernel/split/kernel_split_data.h"
|
#include "kernel/split/kernel_split_data.h"
|
||||||
#include "kernel/kernel_globals.h"
|
#include "kernel/kernel_globals.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
|
||||||
#include "kernel/osl/osl_shader.h"
|
#include "kernel/osl/osl_shader.h"
|
||||||
#include "kernel/osl/osl_globals.h"
|
#include "kernel/osl/osl_globals.h"
|
||||||
|
|
||||||
@@ -53,6 +56,83 @@ CCL_NAMESPACE_BEGIN
|
|||||||
|
|
||||||
class CPUDevice;
|
class CPUDevice;
|
||||||
|
|
||||||
|
/* Has to be outside of the class to be shared across template instantiations. */
|
||||||
|
static const char *logged_architecture = "";
|
||||||
|
|
||||||
|
template<typename F>
|
||||||
|
class KernelFunctions {
|
||||||
|
public:
|
||||||
|
KernelFunctions()
|
||||||
|
{
|
||||||
|
kernel = (F)NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
KernelFunctions(F kernel_default,
|
||||||
|
F kernel_sse2,
|
||||||
|
F kernel_sse3,
|
||||||
|
F kernel_sse41,
|
||||||
|
F kernel_avx,
|
||||||
|
F kernel_avx2)
|
||||||
|
{
|
||||||
|
const char *architecture_name = "default";
|
||||||
|
kernel = kernel_default;
|
||||||
|
|
||||||
|
/* Silence potential warnings about unused variables
|
||||||
|
* when compiling without some architectures. */
|
||||||
|
(void)kernel_sse2;
|
||||||
|
(void)kernel_sse3;
|
||||||
|
(void)kernel_sse41;
|
||||||
|
(void)kernel_avx;
|
||||||
|
(void)kernel_avx2;
|
||||||
|
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||||
|
if(system_cpu_support_avx2()) {
|
||||||
|
architecture_name = "AVX2";
|
||||||
|
kernel = kernel_avx2;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||||
|
if(system_cpu_support_avx()) {
|
||||||
|
architecture_name = "AVX";
|
||||||
|
kernel = kernel_avx;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||||
|
if(system_cpu_support_sse41()) {
|
||||||
|
architecture_name = "SSE4.1";
|
||||||
|
kernel = kernel_sse41;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||||
|
if(system_cpu_support_sse3()) {
|
||||||
|
architecture_name = "SSE3";
|
||||||
|
kernel = kernel_sse3;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||||
|
if(system_cpu_support_sse2()) {
|
||||||
|
architecture_name = "SSE2";
|
||||||
|
kernel = kernel_sse2;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(strstr(architecture_name, logged_architecture) != 0) {
|
||||||
|
VLOG(1) << "Will be using " << architecture_name << " kernels.";
|
||||||
|
logged_architecture = architecture_name;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline F operator()() const {
|
||||||
|
assert(kernel);
|
||||||
|
return kernel;
|
||||||
|
}
|
||||||
|
protected:
|
||||||
|
F kernel;
|
||||||
|
};
|
||||||
|
|
||||||
class CPUSplitKernel : public DeviceSplitKernel {
|
class CPUSplitKernel : public DeviceSplitKernel {
|
||||||
CPUDevice *device;
|
CPUDevice *device;
|
||||||
public:
|
public:
|
||||||
@@ -77,67 +157,6 @@ public:
|
|||||||
|
|
||||||
class CPUDevice : public Device
|
class CPUDevice : public Device
|
||||||
{
|
{
|
||||||
static unordered_map<string, void*> kernel_functions;
|
|
||||||
|
|
||||||
static void register_kernel_function(const char* name, void* func)
|
|
||||||
{
|
|
||||||
kernel_functions[name] = func;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char* get_arch_name()
|
|
||||||
{
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
return "cpu_avx2";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
return "cpu_avx";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
return "cpu_sse41";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
return "cpu_sse3";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
return "cpu_sse2";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
return "cpu";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename F>
|
|
||||||
static F get_kernel_function(string name)
|
|
||||||
{
|
|
||||||
name = string("kernel_") + get_arch_name() + "_" + name;
|
|
||||||
|
|
||||||
unordered_map<string, void*>::iterator it = kernel_functions.find(name);
|
|
||||||
|
|
||||||
if(it == kernel_functions.end()) {
|
|
||||||
assert(!"kernel function not found");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return (F)it->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
friend class CPUSplitKernel;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TaskPool task_pool;
|
TaskPool task_pool;
|
||||||
KernelGlobals kernel_globals;
|
KernelGlobals kernel_globals;
|
||||||
@@ -150,76 +169,88 @@ public:
|
|||||||
|
|
||||||
DeviceRequestedFeatures requested_features;
|
DeviceRequestedFeatures requested_features;
|
||||||
|
|
||||||
|
KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)> path_trace_kernel;
|
||||||
|
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel;
|
||||||
|
KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel;
|
||||||
|
KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
|
||||||
|
|
||||||
|
KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
|
||||||
|
KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)> filter_get_feature_kernel;
|
||||||
|
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)> filter_combine_halves_kernel;
|
||||||
|
|
||||||
|
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
|
||||||
|
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_blur_kernel;
|
||||||
|
KernelFunctions<void(*)(float*, float*, int*, int, int)> filter_nlm_calc_weight_kernel;
|
||||||
|
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
|
||||||
|
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
|
||||||
|
|
||||||
|
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
|
||||||
|
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
|
||||||
|
KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
|
||||||
|
|
||||||
|
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
|
||||||
|
ccl_global uint*, int, int, int, int, int, int, int, int, ccl_global int*, int,
|
||||||
|
ccl_global char*, ccl_global unsigned int*, unsigned int, ccl_global float*)> data_init_kernel;
|
||||||
|
unordered_map<string, KernelFunctions<void(*)(KernelGlobals*, KernelData*)> > split_kernels;
|
||||||
|
|
||||||
|
#define KERNEL_FUNCTIONS(name) \
|
||||||
|
KERNEL_NAME_EVAL(cpu, name), \
|
||||||
|
KERNEL_NAME_EVAL(cpu_sse2, name), \
|
||||||
|
KERNEL_NAME_EVAL(cpu_sse3, name), \
|
||||||
|
KERNEL_NAME_EVAL(cpu_sse41, name), \
|
||||||
|
KERNEL_NAME_EVAL(cpu_avx, name), \
|
||||||
|
KERNEL_NAME_EVAL(cpu_avx2, name)
|
||||||
|
|
||||||
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
|
CPUDevice(DeviceInfo& info, Stats &stats, bool background)
|
||||||
: Device(info, stats, background)
|
: Device(info, stats, background),
|
||||||
|
#define REGISTER_KERNEL(name) name ## _kernel(KERNEL_FUNCTIONS(name))
|
||||||
|
REGISTER_KERNEL(path_trace),
|
||||||
|
REGISTER_KERNEL(convert_to_half_float),
|
||||||
|
REGISTER_KERNEL(convert_to_byte),
|
||||||
|
REGISTER_KERNEL(shader),
|
||||||
|
REGISTER_KERNEL(filter_divide_shadow),
|
||||||
|
REGISTER_KERNEL(filter_get_feature),
|
||||||
|
REGISTER_KERNEL(filter_combine_halves),
|
||||||
|
REGISTER_KERNEL(filter_nlm_calc_difference),
|
||||||
|
REGISTER_KERNEL(filter_nlm_blur),
|
||||||
|
REGISTER_KERNEL(filter_nlm_calc_weight),
|
||||||
|
REGISTER_KERNEL(filter_nlm_update_output),
|
||||||
|
REGISTER_KERNEL(filter_nlm_normalize),
|
||||||
|
REGISTER_KERNEL(filter_construct_transform),
|
||||||
|
REGISTER_KERNEL(filter_nlm_construct_gramian),
|
||||||
|
REGISTER_KERNEL(filter_finalize),
|
||||||
|
REGISTER_KERNEL(data_init)
|
||||||
|
#undef REGISTER_KERNEL
|
||||||
{
|
{
|
||||||
|
|
||||||
#ifdef WITH_OSL
|
#ifdef WITH_OSL
|
||||||
kernel_globals.osl = &osl_globals;
|
kernel_globals.osl = &osl_globals;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* do now to avoid thread issues */
|
|
||||||
system_cpu_support_sse2();
|
|
||||||
system_cpu_support_sse3();
|
|
||||||
system_cpu_support_sse41();
|
|
||||||
system_cpu_support_avx();
|
|
||||||
system_cpu_support_avx2();
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
VLOG(1) << "Will be using AVX2 kernels.";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
VLOG(1) << "Will be using AVX kernels.";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
VLOG(1) << "Will be using SSE4.1 kernels.";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
VLOG(1) << "Will be using SSE3kernels.";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
VLOG(1) << "Will be using SSE2 kernels.";
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
VLOG(1) << "Will be using regular kernels.";
|
|
||||||
}
|
|
||||||
|
|
||||||
use_split_kernel = DebugFlags().cpu.split_kernel;
|
use_split_kernel = DebugFlags().cpu.split_kernel;
|
||||||
if(use_split_kernel) {
|
if(use_split_kernel) {
|
||||||
VLOG(1) << "Will be using split kernel.";
|
VLOG(1) << "Will be using split kernel.";
|
||||||
}
|
}
|
||||||
|
|
||||||
kernel_cpu_register_functions(register_kernel_function);
|
#define REGISTER_SPLIT_KERNEL(name) split_kernels[#name] = KernelFunctions<void(*)(KernelGlobals*, KernelData*)>(KERNEL_FUNCTIONS(name))
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
REGISTER_SPLIT_KERNEL(path_init);
|
||||||
kernel_cpu_sse2_register_functions(register_kernel_function);
|
REGISTER_SPLIT_KERNEL(scene_intersect);
|
||||||
#endif
|
REGISTER_SPLIT_KERNEL(lamp_emission);
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
REGISTER_SPLIT_KERNEL(do_volume);
|
||||||
kernel_cpu_sse3_register_functions(register_kernel_function);
|
REGISTER_SPLIT_KERNEL(queue_enqueue);
|
||||||
#endif
|
REGISTER_SPLIT_KERNEL(indirect_background);
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
REGISTER_SPLIT_KERNEL(shader_setup);
|
||||||
kernel_cpu_sse41_register_functions(register_kernel_function);
|
REGISTER_SPLIT_KERNEL(shader_sort);
|
||||||
#endif
|
REGISTER_SPLIT_KERNEL(shader_eval);
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
|
||||||
kernel_cpu_avx_register_functions(register_kernel_function);
|
REGISTER_SPLIT_KERNEL(subsurface_scatter);
|
||||||
#endif
|
REGISTER_SPLIT_KERNEL(direct_lighting);
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
|
||||||
kernel_cpu_avx2_register_functions(register_kernel_function);
|
REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
|
||||||
#endif
|
REGISTER_SPLIT_KERNEL(next_iteration_setup);
|
||||||
|
REGISTER_SPLIT_KERNEL(indirect_subsurface);
|
||||||
|
REGISTER_SPLIT_KERNEL(buffer_update);
|
||||||
|
#undef REGISTER_SPLIT_KERNEL
|
||||||
|
#undef KERNEL_FUNCTIONS
|
||||||
}
|
}
|
||||||
|
|
||||||
~CPUDevice()
|
~CPUDevice()
|
||||||
@@ -273,13 +304,17 @@ public:
|
|||||||
if(!mem.data_pointer) {
|
if(!mem.data_pointer) {
|
||||||
free((void*)mem.device_pointer);
|
free((void*)mem.device_pointer);
|
||||||
}
|
}
|
||||||
|
|
||||||
mem.device_pointer = 0;
|
mem.device_pointer = 0;
|
||||||
stats.mem_free(mem.device_size);
|
stats.mem_free(mem.device_size);
|
||||||
mem.device_size = 0;
|
mem.device_size = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
|
||||||
|
{
|
||||||
|
return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
|
||||||
|
}
|
||||||
|
|
||||||
void const_copy_to(const char *name, void *host, size_t size)
|
void const_copy_to(const char *name, void *host, size_t size)
|
||||||
{
|
{
|
||||||
kernel_const_copy(&kernel_globals, name, host, size);
|
kernel_const_copy(&kernel_globals, name, host, size);
|
||||||
@@ -326,13 +361,8 @@ public:
|
|||||||
|
|
||||||
void thread_run(DeviceTask *task)
|
void thread_run(DeviceTask *task)
|
||||||
{
|
{
|
||||||
if(task->type == DeviceTask::PATH_TRACE) {
|
if(task->type == DeviceTask::RENDER) {
|
||||||
if(!use_split_kernel) {
|
thread_render(*task);
|
||||||
thread_path_trace(*task);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
thread_path_trace_split(*task);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if(task->type == DeviceTask::FILM_CONVERT)
|
else if(task->type == DeviceTask::FILM_CONVERT)
|
||||||
thread_film_convert(*task);
|
thread_film_convert(*task);
|
||||||
@@ -349,53 +379,223 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void thread_path_trace(DeviceTask& task)
|
bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
|
||||||
{
|
{
|
||||||
if(task_pool.canceled()) {
|
mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);
|
||||||
if(task.need_finish_queue == false)
|
|
||||||
return;
|
TilesInfo *tiles = (TilesInfo*) task->tiles_mem.data_pointer;
|
||||||
|
for(int i = 0; i < 9; i++) {
|
||||||
|
tiles->buffers[i] = buffers[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
KernelGlobals kg = thread_kernel_globals_init();
|
return true;
|
||||||
RenderTile tile;
|
}
|
||||||
|
|
||||||
void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
|
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
|
||||||
|
DenoisingTask *task)
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
path_trace_kernel = kernel_cpu_avx2_path_trace;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
path_trace_kernel = kernel_cpu_avx_path_trace;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
path_trace_kernel = kernel_cpu_sse41_path_trace;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
path_trace_kernel = kernel_cpu_sse3_path_trace;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
path_trace_kernel = kernel_cpu_sse2_path_trace;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
{
|
||||||
path_trace_kernel = kernel_cpu_path_trace;
|
int4 rect = task->rect;
|
||||||
|
int r = task->nlm_state.r;
|
||||||
|
int f = task->nlm_state.f;
|
||||||
|
float a = task->nlm_state.a;
|
||||||
|
float k_2 = task->nlm_state.k_2;
|
||||||
|
|
||||||
|
int w = align_up(rect.z-rect.x, 4);
|
||||||
|
int h = rect.w-rect.y;
|
||||||
|
|
||||||
|
float *blurDifference = (float*) task->nlm_state.temporary_1_ptr;
|
||||||
|
float *difference = (float*) task->nlm_state.temporary_2_ptr;
|
||||||
|
float *weightAccum = (float*) task->nlm_state.temporary_3_ptr;
|
||||||
|
|
||||||
|
memset(weightAccum, 0, sizeof(float)*w*h);
|
||||||
|
memset((float*) out_ptr, 0, sizeof(float)*w*h);
|
||||||
|
|
||||||
|
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||||
|
int dy = i / (2*r+1) - r;
|
||||||
|
int dx = i % (2*r+1) - r;
|
||||||
|
|
||||||
|
int local_rect[4] = {max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy)};
|
||||||
|
filter_nlm_calc_difference_kernel()(dx, dy,
|
||||||
|
(float*) guide_ptr,
|
||||||
|
(float*) variance_ptr,
|
||||||
|
difference,
|
||||||
|
local_rect,
|
||||||
|
w, 0,
|
||||||
|
a, k_2);
|
||||||
|
|
||||||
|
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
|
||||||
|
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
|
||||||
|
filter_nlm_blur_kernel() (difference, blurDifference, local_rect, w, f);
|
||||||
|
|
||||||
|
filter_nlm_update_output_kernel()(dx, dy,
|
||||||
|
blurDifference,
|
||||||
|
(float*) image_ptr,
|
||||||
|
(float*) out_ptr,
|
||||||
|
weightAccum,
|
||||||
|
local_rect,
|
||||||
|
w, f);
|
||||||
}
|
}
|
||||||
|
|
||||||
while(task.acquire_tile(this, tile)) {
|
int local_rect[4] = {0, 0, rect.z-rect.x, rect.w-rect.y};
|
||||||
|
filter_nlm_normalize_kernel()((float*) out_ptr, weightAccum, local_rect, w);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool denoising_construct_transform(DenoisingTask *task)
|
||||||
|
{
|
||||||
|
for(int y = 0; y < task->filter_area.w; y++) {
|
||||||
|
for(int x = 0; x < task->filter_area.z; x++) {
|
||||||
|
filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
|
||||||
|
x + task->filter_area.x,
|
||||||
|
y + task->filter_area.y,
|
||||||
|
y*task->filter_area.z + x,
|
||||||
|
(float*) task->storage.transform.device_pointer,
|
||||||
|
(int*) task->storage.rank.device_pointer,
|
||||||
|
&task->rect.x,
|
||||||
|
task->buffer.pass_stride,
|
||||||
|
task->radius,
|
||||||
|
task->pca_threshold);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool denoising_reconstruct(device_ptr color_ptr,
|
||||||
|
device_ptr color_variance_ptr,
|
||||||
|
device_ptr guide_ptr,
|
||||||
|
device_ptr guide_variance_ptr,
|
||||||
|
device_ptr output_ptr,
|
||||||
|
DenoisingTask *task)
|
||||||
|
{
|
||||||
|
mem_zero(task->storage.XtWX);
|
||||||
|
mem_zero(task->storage.XtWY);
|
||||||
|
|
||||||
|
float *difference = (float*) task->reconstruction_state.temporary_1_ptr;
|
||||||
|
float *blurDifference = (float*) task->reconstruction_state.temporary_2_ptr;
|
||||||
|
|
||||||
|
int r = task->radius;
|
||||||
|
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||||
|
int dy = i / (2*r+1) - r;
|
||||||
|
int dx = i % (2*r+1) - r;
|
||||||
|
|
||||||
|
int local_rect[4] = {max(0, -dx), max(0, -dy),
|
||||||
|
task->reconstruction_state.source_w - max(0, dx),
|
||||||
|
task->reconstruction_state.source_h - max(0, dy)};
|
||||||
|
filter_nlm_calc_difference_kernel()(dx, dy,
|
||||||
|
(float*) guide_ptr,
|
||||||
|
(float*) guide_variance_ptr,
|
||||||
|
difference,
|
||||||
|
local_rect,
|
||||||
|
task->buffer.w,
|
||||||
|
task->buffer.pass_stride,
|
||||||
|
1.0f,
|
||||||
|
task->nlm_k_2);
|
||||||
|
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
|
||||||
|
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
|
||||||
|
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
|
||||||
|
filter_nlm_construct_gramian_kernel()(dx, dy,
|
||||||
|
blurDifference,
|
||||||
|
(float*) task->buffer.mem.device_pointer,
|
||||||
|
(float*) color_ptr,
|
||||||
|
(float*) color_variance_ptr,
|
||||||
|
(float*) task->storage.transform.device_pointer,
|
||||||
|
(int*) task->storage.rank.device_pointer,
|
||||||
|
(float*) task->storage.XtWX.device_pointer,
|
||||||
|
(float3*) task->storage.XtWY.device_pointer,
|
||||||
|
local_rect,
|
||||||
|
&task->reconstruction_state.filter_rect.x,
|
||||||
|
task->buffer.w,
|
||||||
|
task->buffer.h,
|
||||||
|
4,
|
||||||
|
task->buffer.pass_stride);
|
||||||
|
}
|
||||||
|
for(int y = 0; y < task->filter_area.w; y++) {
|
||||||
|
for(int x = 0; x < task->filter_area.z; x++) {
|
||||||
|
filter_finalize_kernel()(x,
|
||||||
|
y,
|
||||||
|
y*task->filter_area.z + x,
|
||||||
|
task->buffer.w,
|
||||||
|
task->buffer.h,
|
||||||
|
(float*) output_ptr,
|
||||||
|
(int*) task->storage.rank.device_pointer,
|
||||||
|
(float*) task->storage.XtWX.device_pointer,
|
||||||
|
(float3*) task->storage.XtWY.device_pointer,
|
||||||
|
&task->reconstruction_state.buffer_params.x,
|
||||||
|
task->render_buffer.samples);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
|
||||||
|
device_ptr mean_ptr, device_ptr variance_ptr,
|
||||||
|
int r, int4 rect, DenoisingTask *task)
|
||||||
|
{
|
||||||
|
(void) task;
|
||||||
|
for(int y = rect.y; y < rect.w; y++) {
|
||||||
|
for(int x = rect.x; x < rect.z; x++) {
|
||||||
|
filter_combine_halves_kernel()(x, y,
|
||||||
|
(float*) mean_ptr,
|
||||||
|
(float*) variance_ptr,
|
||||||
|
(float*) a_ptr,
|
||||||
|
(float*) b_ptr,
|
||||||
|
&rect.x,
|
||||||
|
r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
|
||||||
|
device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
|
||||||
|
device_ptr buffer_variance_ptr, DenoisingTask *task)
|
||||||
|
{
|
||||||
|
for(int y = task->rect.y; y < task->rect.w; y++) {
|
||||||
|
for(int x = task->rect.x; x < task->rect.z; x++) {
|
||||||
|
filter_divide_shadow_kernel()(task->render_buffer.samples,
|
||||||
|
task->tiles,
|
||||||
|
x, y,
|
||||||
|
(float*) a_ptr,
|
||||||
|
(float*) b_ptr,
|
||||||
|
(float*) sample_variance_ptr,
|
||||||
|
(float*) sv_variance_ptr,
|
||||||
|
(float*) buffer_variance_ptr,
|
||||||
|
&task->rect.x,
|
||||||
|
task->render_buffer.pass_stride,
|
||||||
|
task->render_buffer.denoising_data_offset,
|
||||||
|
use_split_kernel);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool denoising_get_feature(int mean_offset,
|
||||||
|
int variance_offset,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
DenoisingTask *task)
|
||||||
|
{
|
||||||
|
for(int y = task->rect.y; y < task->rect.w; y++) {
|
||||||
|
for(int x = task->rect.x; x < task->rect.z; x++) {
|
||||||
|
filter_get_feature_kernel()(task->render_buffer.samples,
|
||||||
|
task->tiles,
|
||||||
|
mean_offset,
|
||||||
|
variance_offset,
|
||||||
|
x, y,
|
||||||
|
(float*) mean_ptr,
|
||||||
|
(float*) variance_ptr,
|
||||||
|
&task->rect.x,
|
||||||
|
task->render_buffer.pass_stride,
|
||||||
|
task->render_buffer.denoising_data_offset,
|
||||||
|
use_split_kernel);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void path_trace(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
|
||||||
|
{
|
||||||
float *render_buffer = (float*)tile.buffer;
|
float *render_buffer = (float*)tile.buffer;
|
||||||
uint *rng_state = (uint*)tile.rng_state;
|
uint *rng_state = (uint*)tile.rng_state;
|
||||||
int start_sample = tile.start_sample;
|
int start_sample = tile.start_sample;
|
||||||
@@ -409,7 +609,7 @@ public:
|
|||||||
|
|
||||||
for(int y = tile.y; y < tile.y + tile.h; y++) {
|
for(int y = tile.y; y < tile.y + tile.h; y++) {
|
||||||
for(int x = tile.x; x < tile.x + tile.w; x++) {
|
for(int x = tile.x; x < tile.x + tile.w; x++) {
|
||||||
path_trace_kernel(&kg, render_buffer, rng_state,
|
path_trace_kernel()(kg, render_buffer, rng_state,
|
||||||
sample, x, y, tile.offset, tile.stride);
|
sample, x, y, tile.offset, tile.stride);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -418,47 +618,80 @@ public:
|
|||||||
|
|
||||||
task.update_progress(&tile, tile.w*tile.h);
|
task.update_progress(&tile, tile.w*tile.h);
|
||||||
}
|
}
|
||||||
|
|
||||||
task.release_tile(tile);
|
|
||||||
|
|
||||||
if(task_pool.canceled()) {
|
|
||||||
if(task.need_finish_queue == false)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
thread_kernel_globals_free(&kg);
|
void denoise(DeviceTask &task, RenderTile &tile)
|
||||||
|
{
|
||||||
|
tile.sample = tile.start_sample + tile.num_samples;
|
||||||
|
|
||||||
|
DenoisingTask denoising(this);
|
||||||
|
|
||||||
|
denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
|
||||||
|
denoising.functions.reconstruct = function_bind(&CPUDevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);
|
||||||
|
denoising.functions.divide_shadow = function_bind(&CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
|
||||||
|
denoising.functions.non_local_means = function_bind(&CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
|
||||||
|
denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
|
||||||
|
denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
|
||||||
|
denoising.functions.set_tiles = function_bind(&CPUDevice::denoising_set_tiles, this, _1, &denoising);
|
||||||
|
|
||||||
|
denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
|
||||||
|
denoising.render_buffer.samples = tile.sample;
|
||||||
|
|
||||||
|
RenderTile rtiles[9];
|
||||||
|
rtiles[4] = tile;
|
||||||
|
task.map_neighbor_tiles(rtiles, this);
|
||||||
|
denoising.tiles_from_rendertiles(rtiles);
|
||||||
|
|
||||||
|
denoising.init_from_devicetask(task);
|
||||||
|
|
||||||
|
denoising.run_denoising();
|
||||||
|
|
||||||
|
task.unmap_neighbor_tiles(rtiles, this);
|
||||||
|
|
||||||
|
task.update_progress(&tile, tile.w*tile.h);
|
||||||
}
|
}
|
||||||
|
|
||||||
void thread_path_trace_split(DeviceTask& task)
|
void thread_render(DeviceTask& task)
|
||||||
{
|
{
|
||||||
if(task_pool.canceled()) {
|
if(task_pool.canceled()) {
|
||||||
if(task.need_finish_queue == false)
|
if(task.need_finish_queue == false)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
RenderTile tile;
|
|
||||||
|
|
||||||
CPUSplitKernel split_kernel(this);
|
|
||||||
|
|
||||||
/* allocate buffer for kernel globals */
|
/* allocate buffer for kernel globals */
|
||||||
device_memory kgbuffer;
|
device_only_memory<KernelGlobals> kgbuffer;
|
||||||
kgbuffer.resize(sizeof(KernelGlobals));
|
kgbuffer.resize(1);
|
||||||
mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
|
mem_alloc("kernel_globals", kgbuffer, MEM_READ_WRITE);
|
||||||
|
|
||||||
KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
|
KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
|
||||||
|
|
||||||
|
CPUSplitKernel *split_kernel = NULL;
|
||||||
|
if(use_split_kernel) {
|
||||||
|
split_kernel = new CPUSplitKernel(this);
|
||||||
requested_features.max_closure = MAX_CLOSURE;
|
requested_features.max_closure = MAX_CLOSURE;
|
||||||
if(!split_kernel.load_kernels(requested_features)) {
|
if(!split_kernel->load_kernels(requested_features)) {
|
||||||
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
|
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
|
||||||
mem_free(kgbuffer);
|
mem_free(kgbuffer);
|
||||||
|
|
||||||
|
delete split_kernel;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
RenderTile tile;
|
||||||
while(task.acquire_tile(this, tile)) {
|
while(task.acquire_tile(this, tile)) {
|
||||||
|
if(tile.task == RenderTile::PATH_TRACE) {
|
||||||
|
if(use_split_kernel) {
|
||||||
device_memory data;
|
device_memory data;
|
||||||
split_kernel.path_trace(&task, tile, kgbuffer, data);
|
split_kernel->path_trace(&task, tile, kgbuffer, data);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
path_trace(task, tile, kg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(tile.task == RenderTile::DENOISE) {
|
||||||
|
denoise(task, tile);
|
||||||
|
}
|
||||||
|
|
||||||
task.release_tile(tile);
|
task.release_tile(tile);
|
||||||
|
|
||||||
@@ -470,6 +703,7 @@ public:
|
|||||||
|
|
||||||
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
|
thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
|
||||||
mem_free(kgbuffer);
|
mem_free(kgbuffer);
|
||||||
|
delete split_kernel;
|
||||||
}
|
}
|
||||||
|
|
||||||
void thread_film_convert(DeviceTask& task)
|
void thread_film_convert(DeviceTask& task)
|
||||||
@@ -477,85 +711,15 @@ public:
|
|||||||
float sample_scale = 1.0f/(task.sample + 1);
|
float sample_scale = 1.0f/(task.sample + 1);
|
||||||
|
|
||||||
if(task.rgba_half) {
|
if(task.rgba_half) {
|
||||||
void(*convert_to_half_float_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
convert_to_half_float_kernel = kernel_cpu_avx2_convert_to_half_float;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
convert_to_half_float_kernel = kernel_cpu_avx_convert_to_half_float;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
convert_to_half_float_kernel = kernel_cpu_sse41_convert_to_half_float;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
convert_to_half_float_kernel = kernel_cpu_sse3_convert_to_half_float;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
convert_to_half_float_kernel = kernel_cpu_sse2_convert_to_half_float;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
convert_to_half_float_kernel = kernel_cpu_convert_to_half_float;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int y = task.y; y < task.y + task.h; y++)
|
for(int y = task.y; y < task.y + task.h; y++)
|
||||||
for(int x = task.x; x < task.x + task.w; x++)
|
for(int x = task.x; x < task.x + task.w; x++)
|
||||||
convert_to_half_float_kernel(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
|
convert_to_half_float_kernel()(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
|
||||||
sample_scale, x, y, task.offset, task.stride);
|
sample_scale, x, y, task.offset, task.stride);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
void(*convert_to_byte_kernel)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int);
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
convert_to_byte_kernel = kernel_cpu_avx2_convert_to_byte;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
convert_to_byte_kernel = kernel_cpu_avx_convert_to_byte;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
convert_to_byte_kernel = kernel_cpu_sse41_convert_to_byte;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
convert_to_byte_kernel = kernel_cpu_sse3_convert_to_byte;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
convert_to_byte_kernel = kernel_cpu_sse2_convert_to_byte;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
convert_to_byte_kernel = kernel_cpu_convert_to_byte;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int y = task.y; y < task.y + task.h; y++)
|
for(int y = task.y; y < task.y + task.h; y++)
|
||||||
for(int x = task.x; x < task.x + task.w; x++)
|
for(int x = task.x; x < task.x + task.w; x++)
|
||||||
convert_to_byte_kernel(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
|
convert_to_byte_kernel()(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
|
||||||
sample_scale, x, y, task.offset, task.stride);
|
sample_scale, x, y, task.offset, task.stride);
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -568,45 +732,9 @@ public:
|
|||||||
#ifdef WITH_OSL
|
#ifdef WITH_OSL
|
||||||
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
|
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
|
||||||
#endif
|
#endif
|
||||||
void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int, int);
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
shader_kernel = kernel_cpu_avx2_shader;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
shader_kernel = kernel_cpu_avx_shader;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
shader_kernel = kernel_cpu_sse41_shader;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
shader_kernel = kernel_cpu_sse3_shader;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
shader_kernel = kernel_cpu_sse2_shader;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
shader_kernel = kernel_cpu_shader;
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int sample = 0; sample < task.num_samples; sample++) {
|
for(int sample = 0; sample < task.num_samples; sample++) {
|
||||||
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
|
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
|
||||||
shader_kernel(&kg,
|
shader_kernel()(&kg,
|
||||||
(uint4*)task.shader_input,
|
(uint4*)task.shader_input,
|
||||||
(float4*)task.shader_output,
|
(float4*)task.shader_output,
|
||||||
(float*)task.shader_output_luma,
|
(float*)task.shader_output_luma,
|
||||||
@@ -751,58 +879,6 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
|
|||||||
device_memory& use_queues_flags,
|
device_memory& use_queues_flags,
|
||||||
device_memory& work_pool_wgs)
|
device_memory& work_pool_wgs)
|
||||||
{
|
{
|
||||||
typedef void(*data_init_t)(KernelGlobals *kg,
|
|
||||||
ccl_constant KernelData *data,
|
|
||||||
ccl_global void *split_data_buffer,
|
|
||||||
int num_elements,
|
|
||||||
ccl_global char *ray_state,
|
|
||||||
ccl_global uint *rng_state,
|
|
||||||
int start_sample,
|
|
||||||
int end_sample,
|
|
||||||
int sx, int sy, int sw, int sh, int offset, int stride,
|
|
||||||
ccl_global int *Queue_index,
|
|
||||||
int queuesize,
|
|
||||||
ccl_global char *use_queues_flag,
|
|
||||||
ccl_global unsigned int *work_pool_wgs,
|
|
||||||
unsigned int num_samples,
|
|
||||||
ccl_global float *buffer);
|
|
||||||
|
|
||||||
data_init_t data_init;
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
if(system_cpu_support_avx2()) {
|
|
||||||
data_init = kernel_cpu_avx2_data_init;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
if(system_cpu_support_avx()) {
|
|
||||||
data_init = kernel_cpu_avx_data_init;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
if(system_cpu_support_sse41()) {
|
|
||||||
data_init = kernel_cpu_sse41_data_init;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
if(system_cpu_support_sse3()) {
|
|
||||||
data_init = kernel_cpu_sse3_data_init;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
|
||||||
if(system_cpu_support_sse2()) {
|
|
||||||
data_init = kernel_cpu_sse2_data_init;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
{
|
|
||||||
data_init = kernel_cpu_data_init;
|
|
||||||
}
|
|
||||||
|
|
||||||
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
|
KernelGlobals *kg = (KernelGlobals*)kernel_globals.device_pointer;
|
||||||
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
|
kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
|
||||||
|
|
||||||
@@ -810,7 +886,7 @@ bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
|
|||||||
for(int x = 0; x < dim.global_size[0]; x++) {
|
for(int x = 0; x < dim.global_size[0]; x++) {
|
||||||
kg->global_id = make_int2(x, y);
|
kg->global_id = make_int2(x, y);
|
||||||
|
|
||||||
data_init((KernelGlobals*)kernel_globals.device_pointer,
|
device->data_init_kernel()((KernelGlobals*)kernel_globals.device_pointer,
|
||||||
(KernelData*)data.device_pointer,
|
(KernelData*)data.device_pointer,
|
||||||
(void*)split_data.device_pointer,
|
(void*)split_data.device_pointer,
|
||||||
num_global_elements,
|
num_global_elements,
|
||||||
@@ -840,7 +916,7 @@ SplitKernelFunction* CPUSplitKernel::get_split_kernel_function(string kernel_nam
|
|||||||
{
|
{
|
||||||
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
|
CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
|
||||||
|
|
||||||
kernel->func = device->get_kernel_function<void(*)(KernelGlobals*, KernelData*)>(kernel_name);
|
kernel->func = device->split_kernels[kernel_name]();
|
||||||
if(!kernel->func) {
|
if(!kernel->func) {
|
||||||
delete kernel;
|
delete kernel;
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -864,8 +940,6 @@ uint64_t CPUSplitKernel::state_buffer_size(device_memory& kernel_globals, device
|
|||||||
return split_data_buffer_size(kg, num_threads);
|
return split_data_buffer_size(kg, num_threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
unordered_map<string, void*> CPUDevice::kernel_functions;
|
|
||||||
|
|
||||||
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
|
Device *device_cpu_create(DeviceInfo& info, Stats &stats, bool background)
|
||||||
{
|
{
|
||||||
return new CPUDevice(info, stats, background);
|
return new CPUDevice(info, stats, background);
|
||||||
|
@@ -21,11 +21,14 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "device/device.h"
|
#include "device/device.h"
|
||||||
|
#include "device/device_denoising.h"
|
||||||
#include "device/device_intern.h"
|
#include "device/device_intern.h"
|
||||||
#include "device/device_split_kernel.h"
|
#include "device/device_split_kernel.h"
|
||||||
|
|
||||||
#include "render/buffers.h"
|
#include "render/buffers.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_defines.h"
|
||||||
|
|
||||||
#ifdef WITH_CUDA_DYNLOAD
|
#ifdef WITH_CUDA_DYNLOAD
|
||||||
# include "cuew.h"
|
# include "cuew.h"
|
||||||
#else
|
#else
|
||||||
@@ -113,7 +116,7 @@ public:
|
|||||||
DedicatedTaskPool task_pool;
|
DedicatedTaskPool task_pool;
|
||||||
CUdevice cuDevice;
|
CUdevice cuDevice;
|
||||||
CUcontext cuContext;
|
CUcontext cuContext;
|
||||||
CUmodule cuModule;
|
CUmodule cuModule, cuFilterModule;
|
||||||
map<device_ptr, bool> tex_interp_map;
|
map<device_ptr, bool> tex_interp_map;
|
||||||
map<device_ptr, uint> tex_bindless_map;
|
map<device_ptr, uint> tex_bindless_map;
|
||||||
int cuDevId;
|
int cuDevId;
|
||||||
@@ -170,7 +173,7 @@ public:
|
|||||||
CUresult result = stmt; \
|
CUresult result = stmt; \
|
||||||
\
|
\
|
||||||
if(result != CUDA_SUCCESS) { \
|
if(result != CUDA_SUCCESS) { \
|
||||||
string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
|
string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
|
||||||
if(error_msg == "") \
|
if(error_msg == "") \
|
||||||
error_msg = message; \
|
error_msg = message; \
|
||||||
fprintf(stderr, "%s\n", message.c_str()); \
|
fprintf(stderr, "%s\n", message.c_str()); \
|
||||||
@@ -301,7 +304,8 @@ public:
|
|||||||
* kernel sources md5 and only depends on compiler or compilation settings.
|
* kernel sources md5 and only depends on compiler or compilation settings.
|
||||||
*/
|
*/
|
||||||
string compile_kernel_get_common_cflags(
|
string compile_kernel_get_common_cflags(
|
||||||
const DeviceRequestedFeatures& requested_features, bool split=false)
|
const DeviceRequestedFeatures& requested_features,
|
||||||
|
bool filter=false, bool split=false)
|
||||||
{
|
{
|
||||||
const int cuda_version = cuewCompilerVersion();
|
const int cuda_version = cuewCompilerVersion();
|
||||||
const int machine = system_cpu_bits();
|
const int machine = system_cpu_bits();
|
||||||
@@ -316,7 +320,7 @@ public:
|
|||||||
machine,
|
machine,
|
||||||
cuda_version,
|
cuda_version,
|
||||||
include_path.c_str());
|
include_path.c_str());
|
||||||
if(use_adaptive_compilation()) {
|
if(!filter && use_adaptive_compilation()) {
|
||||||
cflags += " " + requested_features.get_build_options();
|
cflags += " " + requested_features.get_build_options();
|
||||||
}
|
}
|
||||||
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
|
const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
|
||||||
@@ -364,8 +368,22 @@ public:
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
string compile_kernel(const DeviceRequestedFeatures& requested_features, bool split=false)
|
string compile_kernel(const DeviceRequestedFeatures& requested_features,
|
||||||
|
bool filter=false, bool split=false)
|
||||||
{
|
{
|
||||||
|
const char *name, *source;
|
||||||
|
if(filter) {
|
||||||
|
name = "filter";
|
||||||
|
source = "filter.cu";
|
||||||
|
}
|
||||||
|
else if(split) {
|
||||||
|
name = "kernel_split";
|
||||||
|
source = "kernel_split.cu";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
name = "kernel";
|
||||||
|
source = "kernel.cu";
|
||||||
|
}
|
||||||
/* Compute cubin name. */
|
/* Compute cubin name. */
|
||||||
int major, minor;
|
int major, minor;
|
||||||
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
|
cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
|
||||||
@@ -373,9 +391,8 @@ public:
|
|||||||
|
|
||||||
/* Attempt to use kernel provided with Blender. */
|
/* Attempt to use kernel provided with Blender. */
|
||||||
if(!use_adaptive_compilation()) {
|
if(!use_adaptive_compilation()) {
|
||||||
const string cubin = path_get(string_printf(split ? "lib/kernel_split_sm_%d%d.cubin"
|
const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
|
||||||
: "lib/kernel_sm_%d%d.cubin",
|
name, major, minor));
|
||||||
major, minor));
|
|
||||||
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
|
VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
|
||||||
if(path_exists(cubin)) {
|
if(path_exists(cubin)) {
|
||||||
VLOG(1) << "Using precompiled kernel.";
|
VLOG(1) << "Using precompiled kernel.";
|
||||||
@@ -384,7 +401,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
const string common_cflags =
|
const string common_cflags =
|
||||||
compile_kernel_get_common_cflags(requested_features, split);
|
compile_kernel_get_common_cflags(requested_features, filter, split);
|
||||||
|
|
||||||
/* Try to use locally compiled kernel. */
|
/* Try to use locally compiled kernel. */
|
||||||
const string source_path = path_get("source");
|
const string source_path = path_get("source");
|
||||||
@@ -395,9 +412,8 @@ public:
|
|||||||
*/
|
*/
|
||||||
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
|
const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
|
||||||
|
|
||||||
const string cubin_file = string_printf(split ? "cycles_kernel_split_sm%d%d_%s.cubin"
|
const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
|
||||||
: "cycles_kernel_sm%d%d_%s.cubin",
|
name, major, minor,
|
||||||
major, minor,
|
|
||||||
cubin_md5.c_str());
|
cubin_md5.c_str());
|
||||||
const string cubin = path_cache_get(path_join("kernels", cubin_file));
|
const string cubin = path_cache_get(path_join("kernels", cubin_file));
|
||||||
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
|
VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
|
||||||
@@ -432,7 +448,7 @@ public:
|
|||||||
const string kernel = path_join(
|
const string kernel = path_join(
|
||||||
path_join(source_path, "kernel"),
|
path_join(source_path, "kernel"),
|
||||||
path_join("kernels",
|
path_join("kernels",
|
||||||
path_join("cuda", split ? "kernel_split.cu" : "kernel.cu")));
|
path_join("cuda", source)));
|
||||||
double starttime = time_dt();
|
double starttime = time_dt();
|
||||||
printf("Compiling CUDA kernel ...\n");
|
printf("Compiling CUDA kernel ...\n");
|
||||||
|
|
||||||
@@ -480,11 +496,14 @@ public:
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
/* get kernel */
|
/* get kernel */
|
||||||
string cubin = compile_kernel(requested_features, use_split_kernel());
|
string cubin = compile_kernel(requested_features, false, use_split_kernel());
|
||||||
|
|
||||||
if(cubin == "")
|
if(cubin == "")
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
string filter_cubin = compile_kernel(requested_features, true, false);
|
||||||
|
if(filter_cubin == "")
|
||||||
|
return false;
|
||||||
|
|
||||||
/* open module */
|
/* open module */
|
||||||
cuda_push_context();
|
cuda_push_context();
|
||||||
|
|
||||||
@@ -499,6 +518,14 @@ public:
|
|||||||
if(cuda_error_(result, "cuModuleLoad"))
|
if(cuda_error_(result, "cuModuleLoad"))
|
||||||
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
|
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
|
||||||
|
|
||||||
|
if(path_read_text(filter_cubin, cubin_data))
|
||||||
|
result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
|
||||||
|
else
|
||||||
|
result = CUDA_ERROR_FILE_NOT_FOUND;
|
||||||
|
|
||||||
|
if(cuda_error_(result, "cuModuleLoad"))
|
||||||
|
cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
|
||||||
|
|
||||||
cuda_pop_context();
|
cuda_pop_context();
|
||||||
|
|
||||||
return (result == CUDA_SUCCESS);
|
return (result == CUDA_SUCCESS);
|
||||||
@@ -581,6 +608,11 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return a device pointer that lies inside the existing allocation `mem`.
 *
 * The result is the base device pointer of `mem` advanced by
 * mem.memory_elements_size(offset) bytes (presumably the byte size of
 * `offset` elements - confirm against device_memory). No new memory is
 * allocated; the size and memory type arguments are ignored. */
virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/, MemoryType /*type*/)
{
	char *base_address = (char*) mem.device_pointer;
	char *sub_address = base_address + mem.memory_elements_size(offset);
	return (device_ptr) sub_address;
}
|
||||||
|
|
||||||
void const_copy_to(const char *name, void *host, size_t size)
|
void const_copy_to(const char *name, void *host, size_t size)
|
||||||
{
|
{
|
||||||
CUdeviceptr mem;
|
CUdeviceptr mem;
|
||||||
@@ -881,6 +913,368 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Allocate the tile info block of `task` on the device, store the nine
 * buffer pointers from `buffers` in its host copy and upload it.
 *
 * Returns false if the device is in an error state afterwards. */
bool denoising_set_tiles(device_ptr *buffers, DenoisingTask *task)
{
	mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_ONLY);

	/* Fill in the host side copy before it is transferred. */
	TilesInfo *tile_info = (TilesInfo*) task->tiles_mem.data_pointer;
	for(int tile = 0; tile < 9; ++tile) {
		tile_info->buffers[tile] = buffers[tile];
	}

	mem_copy_to(task->tiles_mem);

	const bool success = !have_error();
	return success;
}
|
||||||
|
|
||||||
|
/* Query the maximum number of threads per block for `func` and derive a
 * square block of threads x threads together with the number of blocks
 * needed to cover an area of w x h elements (rounded up).
 *
 * Declares threads_per_block, threads, xblocks and yblocks in the enclosing
 * scope; CUDA_LAUNCH_KERNEL reads threads, xblocks and yblocks.
 *
 * cuda_assert only records a failure and does not leave the function, so
 * threads_per_block is given a defined value up front and threads is kept
 * at 1 or more. Otherwise a failed query would leave the value
 * indeterminate and could lead to a division by zero below. */
#define CUDA_GET_BLOCKSIZE(func, w, h) \
	int threads_per_block = 1; \
	cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
	int threads = max(1, (int)sqrt((float)threads_per_block)); \
	int xblocks = ((w) + threads - 1)/threads; \
	int yblocks = ((h) + threads - 1)/threads;
|
||||||
|
|
||||||
|
/* Launch `func` with the kernel argument array `args` on a 2D grid of
 * xblocks x yblocks blocks, each of threads x threads threads.
 *
 * Relies on the variables xblocks, yblocks and threads that
 * CUDA_GET_BLOCKSIZE declares in the enclosing scope. No dynamic shared
 * memory is requested and the launch goes to stream 0. */
#define CUDA_LAUNCH_KERNEL(func, args) \
	cuda_assert(cuLaunchKernel(func, \
	                           xblocks, yblocks, 1, \
	                           threads, threads, 1, \
	                           0, 0, args, 0));
|
||||||
|
|
||||||
|
/* Run the non-local means filter kernels of the filter module.
 *
 * image_ptr is the buffer being filtered, guide_ptr and variance_ptr are
 * handed to the difference kernel, and out_ptr receives the result.
 * Window radius (r), patch size (f) and the parameters a and k_2 are taken
 * from task->nlm_state, as are the three temporary device buffers.
 *
 * For every shift (dx, dy) inside the (2r+1) x (2r+1) window the sequence
 * calc_difference, blur, calc_weight, blur, update_output is launched on
 * the part of the rect for which the shifted pixel is still inside. A final
 * normalize kernel runs over the whole rect.
 *
 * Returns false if the device was or ends up in an error state. */
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
                               DenoisingTask *task)
{
	if(have_error())
		return false;

	cuda_push_context();

	/* rect holds (min x, min y, max x, max y). */
	int4 rect = task->rect;
	int w = rect.z - rect.x;
	int h = rect.w - rect.y;
	const int r = task->nlm_state.r;
	int f = task->nlm_state.f;
	float a = task->nlm_state.a;
	float k_2 = task->nlm_state.k_2;

	CUdeviceptr difference_mem = task->nlm_state.temporary_1_ptr;
	CUdeviceptr blurred_mem = task->nlm_state.temporary_2_ptr;
	CUdeviceptr weight_sum_mem = task->nlm_state.temporary_3_ptr;

	/* Both accumulation buffers start out as w*h zeroed floats. */
	cuda_assert(cuMemsetD8(weight_sum_mem, 0, sizeof(float)*w*h));
	cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));

	CUfunction calc_difference_func, blur_func, calc_weight_func, update_output_func, normalize_func;
	cuda_assert(cuModuleGetFunction(&calc_difference_func, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
	cuda_assert(cuModuleGetFunction(&blur_func, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
	cuda_assert(cuModuleGetFunction(&calc_weight_func, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
	cuda_assert(cuModuleGetFunction(&update_output_func, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
	cuda_assert(cuModuleGetFunction(&normalize_func, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));

	cuda_assert(cuFuncSetCacheConfig(calc_difference_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(blur_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(calc_weight_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(update_output_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(normalize_func, CU_FUNC_CACHE_PREFER_L1));

	/* The launch configuration of the difference kernel is used for all launches. */
	CUDA_GET_BLOCKSIZE(calc_difference_func, w, h);

	/* The argument arrays store addresses, so updating dx, dy and
	 * local_rect inside the loop is enough to change what the kernels see. */
	int dx, dy;
	int4 local_rect;
	int channel_offset = 0;
	void *calc_difference_args[] = {&dx, &dy,
	                                &guide_ptr, &variance_ptr, &difference_mem,
	                                &local_rect, &w, &channel_offset, &a, &k_2};
	void *blur_args[] = {&difference_mem, &blurred_mem, &local_rect, &w, &f};
	void *calc_weight_args[] = {&blurred_mem, &difference_mem, &local_rect, &w, &f};
	void *update_output_args[] = {&dx, &dy,
	                              &blurred_mem, &image_ptr, &out_ptr, &weight_sum_mem,
	                              &local_rect, &w, &f};

	const int window = 2*r + 1;
	for(int i = 0; i < window*window; i++) {
		dy = i / window - r;
		dx = i % window - r;
		/* Shrink the area so that the pixel shifted by (dx, dy) stays inside. */
		local_rect = make_int4(max(0, -dx), max(0, -dy), w - max(0, dx), h - max(0, dy));

		CUDA_LAUNCH_KERNEL(calc_difference_func, calc_difference_args);
		CUDA_LAUNCH_KERNEL(blur_func, blur_args);
		CUDA_LAUNCH_KERNEL(calc_weight_func, calc_weight_args);
		CUDA_LAUNCH_KERNEL(blur_func, blur_args);
		CUDA_LAUNCH_KERNEL(update_output_func, update_output_args);
	}

	/* Normalization covers the full area again. */
	local_rect = make_int4(0, 0, w, h);
	void *normalize_args[] = {&out_ptr, &weight_sum_mem, &local_rect, &w};
	CUDA_LAUNCH_KERNEL(normalize_func, normalize_args);
	cuda_assert(cuCtxSynchronize());

	cuda_pop_context();
	return !have_error();
}
|
||||||
|
|
||||||
|
/* Launch kernel_cuda_filter_construct_transform for `task`.
 *
 * The grid covers task->storage.w x task->storage.h elements. The kernel
 * receives the denoising buffer, the transform and rank storage, the filter
 * area, the rect, the radius, the PCA threshold and the pass stride.
 *
 * Returns false if the device was or ends up in an error state. */
bool denoising_construct_transform(DenoisingTask *task)
{
	if(have_error())
		return false;

	cuda_push_context();

	CUfunction construct_transform_func;
	cuda_assert(cuModuleGetFunction(&construct_transform_func, cuFilterModule, "kernel_cuda_filter_construct_transform"));
	cuda_assert(cuFuncSetCacheConfig(construct_transform_func, CU_FUNC_CACHE_PREFER_SHARED));
	CUDA_GET_BLOCKSIZE(construct_transform_func, task->storage.w, task->storage.h);

	/* Kernel arguments are passed as an array of addresses. */
	void *kernel_args[] = {
		&task->buffer.mem.device_pointer,
		&task->storage.transform.device_pointer,
		&task->storage.rank.device_pointer,
		&task->filter_area,
		&task->rect,
		&task->radius,
		&task->pca_threshold,
		&task->buffer.pass_stride
	};
	CUDA_LAUNCH_KERNEL(construct_transform_func, kernel_args);
	cuda_assert(cuCtxSynchronize());

	cuda_pop_context();
	return !have_error();
}
|
||||||
|
|
||||||
|
/* Run the reconstruction kernels of the filter module for `task`.
 *
 * The XtWX and XtWY storage buffers are zeroed first. Then, for every shift
 * (dx, dy) inside the (2*radius+1) x (2*radius+1) window, the kernels
 * calc_difference, blur, calc_weight, blur and construct_gramian are
 * launched on the part of the source area for which the shifted pixel is
 * still inside. A final finalize kernel writes to output_ptr.
 *
 * Patch size (f = 4) and the parameter a (1.0) are fixed here; k_2 comes
 * from task->nlm_k_2.
 *
 * Returns false if the device was or ends up in an error state. */
bool denoising_reconstruct(device_ptr color_ptr,
                           device_ptr color_variance_ptr,
                           device_ptr guide_ptr,
                           device_ptr guide_variance_ptr,
                           device_ptr output_ptr,
                           DenoisingTask *task)
{
	if(have_error())
		return false;

	mem_zero(task->storage.XtWX);
	mem_zero(task->storage.XtWY);

	cuda_push_context();

	CUfunction calc_difference_func, blur_func, calc_weight_func, construct_gramian_func, finalize_func;
	cuda_assert(cuModuleGetFunction(&calc_difference_func, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
	cuda_assert(cuModuleGetFunction(&blur_func, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
	cuda_assert(cuModuleGetFunction(&calc_weight_func, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
	cuda_assert(cuModuleGetFunction(&construct_gramian_func, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
	cuda_assert(cuModuleGetFunction(&finalize_func, cuFilterModule, "kernel_cuda_filter_finalize"));

	cuda_assert(cuFuncSetCacheConfig(calc_difference_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(blur_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(calc_weight_func, CU_FUNC_CACHE_PREFER_L1));
	cuda_assert(cuFuncSetCacheConfig(construct_gramian_func, CU_FUNC_CACHE_PREFER_SHARED));
	cuda_assert(cuFuncSetCacheConfig(finalize_func, CU_FUNC_CACHE_PREFER_L1));

	/* The launch configuration of the difference kernel is used for all launches. */
	CUDA_GET_BLOCKSIZE(calc_difference_func,
	                   task->reconstruction_state.source_w,
	                   task->reconstruction_state.source_h);

	CUdeviceptr difference_mem = task->reconstruction_state.temporary_1_ptr;
	CUdeviceptr blurred_mem = task->reconstruction_state.temporary_2_ptr;

	const int r = task->radius;
	int f = 4;
	float a = 1.0f;

	/* The argument arrays store addresses, so they are set up once and the
	 * per-shift values are simply rewritten inside the loop. */
	int dx, dy;
	int local_rect[4];

	void *calc_difference_args[] = {
		&dx, &dy,
		&guide_ptr,
		&guide_variance_ptr,
		&difference_mem,
		&local_rect,
		&task->buffer.w,
		&task->buffer.pass_stride,
		&a,
		&task->nlm_k_2
	};
	/* Used for both blur launches of an iteration. */
	void *blur_args[] = {
		&difference_mem,
		&blurred_mem,
		&local_rect,
		&task->buffer.w,
		&f
	};
	void *calc_weight_args[] = {
		&blurred_mem,
		&difference_mem,
		&local_rect,
		&task->buffer.w,
		&f
	};
	void *construct_gramian_args[] = {
		&dx, &dy,
		&blurred_mem,
		&task->buffer.mem.device_pointer,
		&color_ptr,
		&color_variance_ptr,
		&task->storage.transform.device_pointer,
		&task->storage.rank.device_pointer,
		&task->storage.XtWX.device_pointer,
		&task->storage.XtWY.device_pointer,
		&local_rect,
		&task->reconstruction_state.filter_rect,
		&task->buffer.w,
		&task->buffer.h,
		&f,
		&task->buffer.pass_stride
	};

	const int window = 2*r + 1;
	for(int i = 0; i < window*window; i++) {
		dy = i / window - r;
		dx = i % window - r;

		/* Shrink the area so that the pixel shifted by (dx, dy) stays inside. */
		local_rect[0] = max(0, -dx);
		local_rect[1] = max(0, -dy);
		local_rect[2] = task->reconstruction_state.source_w - max(0, dx);
		local_rect[3] = task->reconstruction_state.source_h - max(0, dy);

		CUDA_LAUNCH_KERNEL(calc_difference_func, calc_difference_args);
		CUDA_LAUNCH_KERNEL(blur_func, blur_args);
		CUDA_LAUNCH_KERNEL(calc_weight_func, calc_weight_args);
		CUDA_LAUNCH_KERNEL(blur_func, blur_args);
		CUDA_LAUNCH_KERNEL(construct_gramian_func, construct_gramian_args);
	}

	void *finalize_args[] = {
		&task->buffer.w,
		&task->buffer.h,
		&output_ptr,
		&task->storage.rank.device_pointer,
		&task->storage.XtWX.device_pointer,
		&task->storage.XtWY.device_pointer,
		&task->filter_area,
		&task->reconstruction_state.buffer_params.x,
		&task->render_buffer.samples
	};
	CUDA_LAUNCH_KERNEL(finalize_func, finalize_args);
	cuda_assert(cuCtxSynchronize());

	cuda_pop_context();
	return !have_error();
}
|
||||||
|
|
||||||
|
/* Launch kernel_cuda_filter_combine_halves.
 *
 * The kernel receives, in this order, mean_ptr, variance_ptr, a_ptr, b_ptr,
 * `rect` and `r`. Note that the grid is sized from task->rect, while the
 * rect handed to the kernel is the `rect` argument.
 *
 * Returns false if the device was or ends up in an error state. */
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
                              device_ptr mean_ptr, device_ptr variance_ptr,
                              int r, int4 rect, DenoisingTask *task)
{
	if(have_error())
		return false;

	cuda_push_context();

	CUfunction combine_halves_func;
	cuda_assert(cuModuleGetFunction(&combine_halves_func, cuFilterModule, "kernel_cuda_filter_combine_halves"));
	cuda_assert(cuFuncSetCacheConfig(combine_halves_func, CU_FUNC_CACHE_PREFER_L1));
	CUDA_GET_BLOCKSIZE(combine_halves_func,
	                   task->rect.z - task->rect.x,
	                   task->rect.w - task->rect.y);

	/* Kernel arguments are passed as an array of addresses. */
	void *kernel_args[] = {
		&mean_ptr,
		&variance_ptr,
		&a_ptr,
		&b_ptr,
		&rect,
		&r
	};
	CUDA_LAUNCH_KERNEL(combine_halves_func, kernel_args);
	cuda_assert(cuCtxSynchronize());

	cuda_pop_context();
	return !have_error();
}
|
||||||
|
|
||||||
|
/* Launch kernel_cuda_filter_divide_shadow over task->rect.
 *
 * Besides the five device pointers given as arguments, the kernel receives
 * the sample count, the tile info block, the rect, the pass stride and the
 * denoising data offset of the render buffer, and a flag that is true when
 * the split kernel is in use.
 *
 * Returns false if the device was or ends up in an error state. */
bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
                             device_ptr buffer_variance_ptr, DenoisingTask *task)
{
	if(have_error())
		return false;

	cuda_push_context();

	CUfunction divide_shadow_func;
	cuda_assert(cuModuleGetFunction(&divide_shadow_func, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
	cuda_assert(cuFuncSetCacheConfig(divide_shadow_func, CU_FUNC_CACHE_PREFER_L1));
	CUDA_GET_BLOCKSIZE(divide_shadow_func,
	                   task->rect.z - task->rect.x,
	                   task->rect.w - task->rect.y);

	bool use_split_variance = use_split_kernel();

	/* Kernel arguments are passed as an array of addresses. */
	void *kernel_args[] = {
		&task->render_buffer.samples,
		&task->tiles_mem.device_pointer,
		&a_ptr,
		&b_ptr,
		&sample_variance_ptr,
		&sv_variance_ptr,
		&buffer_variance_ptr,
		&task->rect,
		&task->render_buffer.pass_stride,
		&task->render_buffer.denoising_data_offset,
		&use_split_variance
	};
	CUDA_LAUNCH_KERNEL(divide_shadow_func, kernel_args);
	cuda_assert(cuCtxSynchronize());

	cuda_pop_context();
	return !have_error();
}
|
||||||
|
|
||||||
|
bool denoising_get_feature(int mean_offset,
|
||||||
|
int variance_offset,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
DenoisingTask *task)
|
||||||
|
{
|
||||||
|
if(have_error())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
cuda_push_context();
|
||||||
|
|
||||||
|
CUfunction cuFilterGetFeature;
|
||||||
|
cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
|
||||||
|
cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
|
||||||
|
CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
|
||||||
|
task->rect.z-task->rect.x,
|
||||||
|
task->rect.w-task->rect.y);
|
||||||
|
|
||||||
|
bool use_split_variance = use_split_kernel();
|
||||||
|
void *args[] = {&task->render_buffer.samples,
|
||||||
|
&task->tiles_mem.device_pointer,
|
||||||
|
&mean_offset,
|
||||||
|
&variance_offset,
|
||||||
|
&mean_ptr,
|
||||||
|
&variance_ptr,
|
||||||
|
&task->rect,
|
||||||
|
&task->render_buffer.pass_stride,
|
||||||
|
&task->render_buffer.denoising_data_offset,
|
||||||
|
&use_split_variance};
|
||||||
|
CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
|
||||||
|
cuda_assert(cuCtxSynchronize());
|
||||||
|
|
||||||
|
cuda_pop_context();
|
||||||
|
return !have_error();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Denoise the render tile `rtile` on this device.
 *
 * Sets up a DenoisingTask whose callbacks are bound to the denoising_*
 * methods of this device, maps the tiles around `rtile` through the device
 * task, runs the denoising and unmaps the tiles again. */
void denoise(RenderTile &rtile, const DeviceTask &task)
{
	DenoisingTask denoising(this);

	/* Area to filter and number of samples, taken from the tile itself. */
	denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
	denoising.render_buffer.samples = rtile.sample;

	/* Route every step of the denoiser to the CUDA implementation, always
	 * passing this task as the trailing argument. */
	denoising.functions.set_tiles = function_bind(&CUDADevice::denoising_set_tiles, this, _1, &denoising);
	denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
	denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
	denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
	denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
	denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
	denoising.functions.reconstruct = function_bind(&CUDADevice::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoising);

	/* The tile to denoise sits at index 4 of the nine entries;
	 * map_neighbor_tiles presumably fills in the remaining ones. */
	RenderTile neighbor_tiles[9];
	neighbor_tiles[4] = rtile;
	task.map_neighbor_tiles(neighbor_tiles, this);
	denoising.tiles_from_rendertiles(neighbor_tiles);

	/* Reads the tile extents set up above, so it has to come afterwards. */
	denoising.init_from_devicetask(task);

	denoising.run_denoising();

	task.unmap_neighbor_tiles(neighbor_tiles, this);
}
|
||||||
|
|
||||||
void path_trace(RenderTile& rtile, int sample, bool branched)
|
void path_trace(RenderTile& rtile, int sample, bool branched)
|
||||||
{
|
{
|
||||||
if(have_error())
|
if(have_error())
|
||||||
@@ -1305,7 +1699,7 @@ public:
|
|||||||
|
|
||||||
void thread_run(DeviceTask *task)
|
void thread_run(DeviceTask *task)
|
||||||
{
|
{
|
||||||
if(task->type == DeviceTask::PATH_TRACE) {
|
if(task->type == DeviceTask::RENDER) {
|
||||||
RenderTile tile;
|
RenderTile tile;
|
||||||
|
|
||||||
bool branched = task->integrator_branched;
|
bool branched = task->integrator_branched;
|
||||||
@@ -1313,9 +1707,26 @@ public:
|
|||||||
/* Upload Bindless Mapping */
|
/* Upload Bindless Mapping */
|
||||||
load_bindless_mapping();
|
load_bindless_mapping();
|
||||||
|
|
||||||
if(!use_split_kernel()) {
|
DeviceRequestedFeatures requested_features;
|
||||||
|
if(use_split_kernel()) {
|
||||||
|
if(!use_adaptive_compilation()) {
|
||||||
|
requested_features.max_closure = 64;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(split_kernel == NULL) {
|
||||||
|
split_kernel = new CUDASplitKernel(this);
|
||||||
|
split_kernel->load_kernels(requested_features);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* keep rendering tiles until done */
|
/* keep rendering tiles until done */
|
||||||
while(task->acquire_tile(this, tile)) {
|
while(task->acquire_tile(this, tile)) {
|
||||||
|
if(tile.task == RenderTile::PATH_TRACE) {
|
||||||
|
if(use_split_kernel()) {
|
||||||
|
device_memory void_buffer;
|
||||||
|
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
|
||||||
|
}
|
||||||
|
else {
|
||||||
int start_sample = tile.start_sample;
|
int start_sample = tile.start_sample;
|
||||||
int end_sample = tile.start_sample + tile.num_samples;
|
int end_sample = tile.start_sample + tile.num_samples;
|
||||||
|
|
||||||
@@ -1331,25 +1742,16 @@ public:
|
|||||||
|
|
||||||
task->update_progress(&tile, tile.w*tile.h);
|
task->update_progress(&tile, tile.w*tile.h);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(tile.task == RenderTile::DENOISE) {
|
||||||
|
tile.sample = tile.start_sample + tile.num_samples;
|
||||||
|
|
||||||
task->release_tile(tile);
|
denoise(tile, *task);
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
DeviceRequestedFeatures requested_features;
|
|
||||||
if(!use_adaptive_compilation()) {
|
|
||||||
requested_features.max_closure = 64;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(split_kernel == NULL) {
|
task->update_progress(&tile, tile.w*tile.h);
|
||||||
split_kernel = new CUDASplitKernel(this);
|
|
||||||
split_kernel->load_kernels(requested_features);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
while(task->acquire_tile(this, tile)) {
|
|
||||||
device_memory void_buffer;
|
|
||||||
split_kernel->path_trace(task, tile, void_buffer, void_buffer);
|
|
||||||
|
|
||||||
task->release_tile(tile);
|
task->release_tile(tile);
|
||||||
|
|
||||||
if(task->get_cancel()) {
|
if(task->get_cancel()) {
|
||||||
@@ -1358,7 +1760,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
else if(task->type == DeviceTask::SHADER) {
|
else if(task->type == DeviceTask::SHADER) {
|
||||||
/* Upload Bindless Mapping */
|
/* Upload Bindless Mapping */
|
||||||
load_bindless_mapping();
|
load_bindless_mapping();
|
||||||
|
218
intern/cycles/device/device_denoising.cpp
Normal file
218
intern/cycles/device/device_denoising.cpp
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "device/device_denoising.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_defines.h"
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
void DenoisingTask::init_from_devicetask(const DeviceTask &task)
|
||||||
|
{
|
||||||
|
radius = task.denoising_radius;
|
||||||
|
nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising_strength));
|
||||||
|
if(task.denoising_relative_pca) {
|
||||||
|
pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising_feature_strength));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising_feature_strength));
|
||||||
|
}
|
||||||
|
|
||||||
|
render_buffer.pass_stride = task.pass_stride;
|
||||||
|
render_buffer.denoising_data_offset = task.pass_denoising_data;
|
||||||
|
render_buffer.denoising_clean_offset = task.pass_denoising_clean;
|
||||||
|
|
||||||
|
/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
|
||||||
|
rect = make_int4(max(tiles->x[0], filter_area.x - radius),
|
||||||
|
max(tiles->y[0], filter_area.y - radius),
|
||||||
|
min(tiles->x[3], filter_area.x + filter_area.z + radius),
|
||||||
|
min(tiles->y[3], filter_area.y + filter_area.w + radius));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fill the TilesInfo structure from a 3x3 neighborhood of render tiles.
 *
 * rtiles must point to 9 tiles; index 4 is the center tile (the one being
 * denoised), indices 3/5 are its horizontal neighbors and 1/7 its vertical
 * neighbors. The buffer pointers of all 9 tiles are handed to the device
 * through functions.set_tiles(). */
void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
{
	/* TilesInfo lives inside an int vector, so size it in units of int. */
	tiles = (TilesInfo*) tiles_mem.resize(sizeof(TilesInfo)/sizeof(int));

	/* Per-tile buffer layout. */
	device_ptr buffers[9];
	for(int tile = 0; tile < 9; ++tile) {
		const RenderTile &rtile = rtiles[tile];
		buffers[tile] = rtile.buffer;
		tiles->offsets[tile] = rtile.offset;
		tiles->strides[tile] = rtile.stride;
	}

	const RenderTile &left = rtiles[3];
	const RenderTile &center = rtiles[4];
	const RenderTile &right = rtiles[5];
	const RenderTile &first_row = rtiles[1];
	const RenderTile &last_row = rtiles[7];

	/* Column boundaries: start of each of the three columns, then the end of the last. */
	tiles->x[0] = left.x;
	tiles->x[1] = center.x;
	tiles->x[2] = right.x;
	tiles->x[3] = right.x + right.w;

	/* Row boundaries, analogous to the columns. */
	tiles->y[0] = first_row.y;
	tiles->y[1] = center.y;
	tiles->y[2] = last_row.y;
	tiles->y[3] = last_row.y + last_row.h;

	/* The center tile is the render buffer that the result refers to. */
	render_buffer.offset = center.offset;
	render_buffer.stride = center.stride;
	render_buffer.ptr = center.buffer;

	functions.set_tiles(buffers);
}
|
||||||
|
|
||||||
|
bool DenoisingTask::run_denoising()
|
||||||
|
{
|
||||||
|
/* Allocate denoising buffer. */
|
||||||
|
buffer.passes = 14;
|
||||||
|
buffer.w = align_up(rect.z - rect.x, 4);
|
||||||
|
buffer.h = rect.w - rect.y;
|
||||||
|
buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
|
||||||
|
buffer.mem.resize(buffer.pass_stride * buffer.passes);
|
||||||
|
device->mem_alloc("Denoising Pixel Buffer", buffer.mem, MEM_READ_WRITE);
|
||||||
|
|
||||||
|
device_ptr null_ptr = (device_ptr) 0;
|
||||||
|
|
||||||
|
/* Prefilter shadow feature. */
|
||||||
|
{
|
||||||
|
device_sub_ptr unfiltered_a (device, buffer.mem, 0, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr unfiltered_b (device, buffer.mem, 1*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr sample_var (device, buffer.mem, 2*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr sample_var_var (device, buffer.mem, 3*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr buffer_var (device, buffer.mem, 5*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr filtered_var (device, buffer.mem, 6*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr nlm_temporary_1(device, buffer.mem, 7*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr nlm_temporary_2(device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr nlm_temporary_3(device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
|
||||||
|
nlm_state.temporary_1_ptr = *nlm_temporary_1;
|
||||||
|
nlm_state.temporary_2_ptr = *nlm_temporary_2;
|
||||||
|
nlm_state.temporary_3_ptr = *nlm_temporary_3;
|
||||||
|
|
||||||
|
/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
|
||||||
|
functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
|
||||||
|
|
||||||
|
/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
|
||||||
|
nlm_state.set_parameters(6, 3, 4.0f, 1.0f);
|
||||||
|
functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
|
||||||
|
|
||||||
|
/* Reuse memory, the previous data isn't needed anymore. */
|
||||||
|
device_ptr filtered_a = *buffer_var,
|
||||||
|
filtered_b = *sample_var;
|
||||||
|
/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
|
||||||
|
nlm_state.set_parameters(5, 3, 1.0f, 0.25f);
|
||||||
|
functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
|
||||||
|
functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
|
||||||
|
|
||||||
|
device_ptr residual_var = *sample_var_var;
|
||||||
|
/* Estimate the residual variance between the two filtered halves. */
|
||||||
|
functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
|
||||||
|
|
||||||
|
device_ptr final_a = *unfiltered_a,
|
||||||
|
final_b = *unfiltered_b;
|
||||||
|
/* Use the residual variance for a second filter pass. */
|
||||||
|
nlm_state.set_parameters(4, 2, 1.0f, 0.5f);
|
||||||
|
functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
|
||||||
|
functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
|
||||||
|
|
||||||
|
/* Combine the two double-filtered halves to a final shadow feature. */
|
||||||
|
device_sub_ptr shadow_pass(device, buffer.mem, 4*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Prefilter general features. */
|
||||||
|
{
|
||||||
|
device_sub_ptr unfiltered (device, buffer.mem, 8*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr variance (device, buffer.mem, 9*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr nlm_temporary_1(device, buffer.mem, 10*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr nlm_temporary_2(device, buffer.mem, 11*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr nlm_temporary_3(device, buffer.mem, 12*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
|
||||||
|
nlm_state.temporary_1_ptr = *nlm_temporary_1;
|
||||||
|
nlm_state.temporary_2_ptr = *nlm_temporary_2;
|
||||||
|
nlm_state.temporary_3_ptr = *nlm_temporary_3;
|
||||||
|
|
||||||
|
int mean_from[] = { 0, 1, 2, 6, 7, 8, 12 };
|
||||||
|
int variance_from[] = { 3, 4, 5, 9, 10, 11, 13 };
|
||||||
|
int pass_to[] = { 1, 2, 3, 0, 5, 6, 7 };
|
||||||
|
for(int pass = 0; pass < 7; pass++) {
|
||||||
|
device_sub_ptr feature_pass(device, buffer.mem, pass_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
/* Get the unfiltered pass and its variance from the RenderBuffers. */
|
||||||
|
functions.get_feature(mean_from[pass], variance_from[pass], *unfiltered, *variance);
|
||||||
|
/* Smooth the pass and store the result in the denoising buffers. */
|
||||||
|
nlm_state.set_parameters(2, 2, 1.0f, 0.25f);
|
||||||
|
functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Copy color passes. */
|
||||||
|
{
|
||||||
|
int mean_from[] = {20, 21, 22};
|
||||||
|
int variance_from[] = {23, 24, 25};
|
||||||
|
int mean_to[] = { 8, 9, 10};
|
||||||
|
int variance_to[] = {11, 12, 13};
|
||||||
|
int num_color_passes = 3;
|
||||||
|
for(int pass = 0; pass < num_color_passes; pass++) {
|
||||||
|
device_sub_ptr color_pass (device, buffer.mem, mean_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr color_var_pass(device, buffer.mem, variance_to[pass]*buffer.pass_stride, buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
functions.get_feature(mean_from[pass], variance_from[pass], *color_pass, *color_var_pass);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
storage.w = filter_area.z;
|
||||||
|
storage.h = filter_area.w;
|
||||||
|
storage.transform.resize(storage.w*storage.h*TRANSFORM_SIZE);
|
||||||
|
storage.rank.resize(storage.w*storage.h);
|
||||||
|
device->mem_alloc("Denoising Transform", storage.transform, MEM_READ_WRITE);
|
||||||
|
device->mem_alloc("Denoising Rank", storage.rank, MEM_READ_WRITE);
|
||||||
|
|
||||||
|
functions.construct_transform();
|
||||||
|
|
||||||
|
device_only_memory<float> temporary_1;
|
||||||
|
device_only_memory<float> temporary_2;
|
||||||
|
temporary_1.resize(buffer.w*buffer.h);
|
||||||
|
temporary_2.resize(buffer.w*buffer.h);
|
||||||
|
device->mem_alloc("Denoising NLM temporary 1", temporary_1, MEM_READ_WRITE);
|
||||||
|
device->mem_alloc("Denoising NLM temporary 2", temporary_2, MEM_READ_WRITE);
|
||||||
|
reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
|
||||||
|
reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
|
||||||
|
|
||||||
|
storage.XtWX.resize(storage.w*storage.h*XTWX_SIZE);
|
||||||
|
storage.XtWY.resize(storage.w*storage.h*XTWY_SIZE);
|
||||||
|
device->mem_alloc("Denoising XtWX", storage.XtWX, MEM_READ_WRITE);
|
||||||
|
device->mem_alloc("Denoising XtWY", storage.XtWY, MEM_READ_WRITE);
|
||||||
|
|
||||||
|
reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
|
||||||
|
int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
|
||||||
|
reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
|
||||||
|
render_buffer.stride,
|
||||||
|
render_buffer.pass_stride,
|
||||||
|
render_buffer.denoising_clean_offset);
|
||||||
|
reconstruction_state.source_w = rect.z-rect.x;
|
||||||
|
reconstruction_state.source_h = rect.w-rect.y;
|
||||||
|
|
||||||
|
{
|
||||||
|
device_sub_ptr color_ptr (device, buffer.mem, 8*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
device_sub_ptr color_var_ptr(device, buffer.mem, 11*buffer.pass_stride, 3*buffer.pass_stride, MEM_READ_WRITE);
|
||||||
|
functions.reconstruct(*color_ptr, *color_var_ptr, *color_ptr, *color_var_ptr, render_buffer.ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
device->mem_free(storage.XtWX);
|
||||||
|
device->mem_free(storage.XtWY);
|
||||||
|
device->mem_free(storage.transform);
|
||||||
|
device->mem_free(storage.rank);
|
||||||
|
device->mem_free(temporary_1);
|
||||||
|
device->mem_free(temporary_2);
|
||||||
|
device->mem_free(buffer.mem);
|
||||||
|
device->mem_free(tiles_mem);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
145
intern/cycles/device/device_denoising.h
Normal file
145
intern/cycles/device/device_denoising.h
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __DEVICE_DENOISING_H__
|
||||||
|
#define __DEVICE_DENOISING_H__
|
||||||
|
|
||||||
|
#include "device/device.h"
|
||||||
|
|
||||||
|
#include "render/buffers.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_defines.h"
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
class DenoisingTask {
|
||||||
|
public:
|
||||||
|
/* Parameters of the denoising algorithm. */
|
||||||
|
int radius;
|
||||||
|
float nlm_k_2;
|
||||||
|
float pca_threshold;
|
||||||
|
|
||||||
|
/* Pointer and parameters of the RenderBuffers. */
|
||||||
|
struct RenderBuffers {
|
||||||
|
int denoising_data_offset;
|
||||||
|
int denoising_clean_offset;
|
||||||
|
int pass_stride;
|
||||||
|
int offset;
|
||||||
|
int stride;
|
||||||
|
device_ptr ptr;
|
||||||
|
int samples;
|
||||||
|
} render_buffer;
|
||||||
|
|
||||||
|
TilesInfo *tiles;
|
||||||
|
device_vector<int> tiles_mem;
|
||||||
|
void tiles_from_rendertiles(RenderTile *rtiles);
|
||||||
|
|
||||||
|
int4 rect;
|
||||||
|
int4 filter_area;
|
||||||
|
|
||||||
|
struct DeviceFunctions {
|
||||||
|
function<bool(device_ptr image_ptr, /* Contains the values that are smoothed. */
|
||||||
|
device_ptr guide_ptr, /* Contains the values that are used to calculate weights. */
|
||||||
|
device_ptr variance_ptr, /* Contains the variance of the guide image. */
|
||||||
|
device_ptr out_ptr /* The filtered output is written into this image. */
|
||||||
|
)> non_local_means;
|
||||||
|
function<bool(device_ptr color_ptr,
|
||||||
|
device_ptr color_variance_ptr,
|
||||||
|
device_ptr guide_ptr,
|
||||||
|
device_ptr guide_variance_ptr,
|
||||||
|
device_ptr output_ptr
|
||||||
|
)> reconstruct;
|
||||||
|
function<bool()> construct_transform;
|
||||||
|
|
||||||
|
function<bool(device_ptr a_ptr,
|
||||||
|
device_ptr b_ptr,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
int r,
|
||||||
|
int4 rect
|
||||||
|
)> combine_halves;
|
||||||
|
function<bool(device_ptr a_ptr,
|
||||||
|
device_ptr b_ptr,
|
||||||
|
device_ptr sample_variance_ptr,
|
||||||
|
device_ptr sv_variance_ptr,
|
||||||
|
device_ptr buffer_variance_ptr
|
||||||
|
)> divide_shadow;
|
||||||
|
function<bool(int mean_offset,
|
||||||
|
int variance_offset,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr
|
||||||
|
)> get_feature;
|
||||||
|
function<bool(device_ptr*)> set_tiles;
|
||||||
|
} functions;
|
||||||
|
|
||||||
|
/* Stores state of the current Reconstruction operation,
|
||||||
|
* which is accessed by the device in order to perform the operation. */
|
||||||
|
struct ReconstructionState {
|
||||||
|
device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
|
||||||
|
device_ptr temporary_2_ptr;
|
||||||
|
|
||||||
|
int4 filter_rect;
|
||||||
|
int4 buffer_params;
|
||||||
|
|
||||||
|
int source_w;
|
||||||
|
int source_h;
|
||||||
|
} reconstruction_state;
|
||||||
|
|
||||||
|
/* Stores state of the current NLM operation,
|
||||||
|
* which is accessed by the device in order to perform the operation. */
|
||||||
|
struct NLMState {
|
||||||
|
device_ptr temporary_1_ptr; /* There three images are used as temporary storage. */
|
||||||
|
device_ptr temporary_2_ptr;
|
||||||
|
device_ptr temporary_3_ptr;
|
||||||
|
|
||||||
|
int r; /* Search radius of the filter. */
|
||||||
|
int f; /* Patch size of the filter. */
|
||||||
|
float a; /* Variance compensation factor in the MSE estimation. */
|
||||||
|
float k_2; /* Squared value of the k parameter of the filter. */
|
||||||
|
|
||||||
|
void set_parameters(int r_, int f_, float a_, float k_2_) { r = r_; f = f_; a = a_, k_2 = k_2_; }
|
||||||
|
} nlm_state;
|
||||||
|
|
||||||
|
struct Storage {
|
||||||
|
device_only_memory<float> transform;
|
||||||
|
device_only_memory<int> rank;
|
||||||
|
device_only_memory<float> XtWX;
|
||||||
|
device_only_memory<float3> XtWY;
|
||||||
|
int w;
|
||||||
|
int h;
|
||||||
|
} storage;
|
||||||
|
|
||||||
|
DenoisingTask(Device *device) : device(device) {}
|
||||||
|
|
||||||
|
void init_from_devicetask(const DeviceTask &task);
|
||||||
|
|
||||||
|
bool run_denoising();
|
||||||
|
|
||||||
|
struct DenoiseBuffers {
|
||||||
|
int pass_stride;
|
||||||
|
int passes;
|
||||||
|
int w;
|
||||||
|
int h;
|
||||||
|
device_only_memory<float> mem;
|
||||||
|
} buffer;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
Device *device;
|
||||||
|
};
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
#endif /* __DEVICE_DENOISING_H__ */
|
@@ -35,6 +35,8 @@
|
|||||||
|
|
||||||
CCL_NAMESPACE_BEGIN
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
class Device;
|
||||||
|
|
||||||
enum MemoryType {
|
enum MemoryType {
|
||||||
MEM_READ_ONLY,
|
MEM_READ_ONLY,
|
||||||
MEM_WRITE_ONLY,
|
MEM_WRITE_ONLY,
|
||||||
@@ -144,7 +146,7 @@ template<> struct device_type_traits<float2> {
|
|||||||
|
|
||||||
template<> struct device_type_traits<float3> {
|
template<> struct device_type_traits<float3> {
|
||||||
static const DataType data_type = TYPE_FLOAT;
|
static const DataType data_type = TYPE_FLOAT;
|
||||||
static const int num_elements = 3;
|
static const int num_elements = 4;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<> struct device_type_traits<float4> {
|
template<> struct device_type_traits<float4> {
|
||||||
@@ -173,6 +175,9 @@ class device_memory
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
|
size_t memory_size() { return data_size*data_elements*datatype_size(data_type); }
|
||||||
|
size_t memory_elements_size(int elements) {
|
||||||
|
return elements*data_elements*datatype_size(data_type);
|
||||||
|
}
|
||||||
|
|
||||||
/* data information */
|
/* data information */
|
||||||
DataType data_type;
|
DataType data_type;
|
||||||
@@ -213,6 +218,22 @@ protected:
|
|||||||
device_memory& operator = (const device_memory&);
|
device_memory& operator = (const device_memory&);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
class device_only_memory : public device_memory
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
device_only_memory()
|
||||||
|
{
|
||||||
|
data_type = device_type_traits<T>::data_type;
|
||||||
|
data_elements = max(device_type_traits<T>::num_elements, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
void resize(size_t num)
|
||||||
|
{
|
||||||
|
device_memory::resize(num*sizeof(T));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
/* Device Vector */
|
/* Device Vector */
|
||||||
|
|
||||||
template<typename T> class device_vector : public device_memory
|
template<typename T> class device_vector : public device_memory
|
||||||
@@ -299,6 +320,27 @@ private:
|
|||||||
array<T> data;
|
array<T> data;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* A device_sub_ptr is a pointer into another existing memory.
|
||||||
|
* Therefore, it is not allocated separately, but just created from the already allocated base memory.
|
||||||
|
* It is freed automatically when it goes out of scope, which should happen before the base memory is freed.
|
||||||
|
* Note that some devices require the offset and size of the sub_ptr to be properly aligned. */
|
||||||
|
class device_sub_ptr
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
device_sub_ptr(Device *device, device_memory& mem, int offset, int size, MemoryType type);
|
||||||
|
~device_sub_ptr();
|
||||||
|
/* No copying. */
|
||||||
|
device_sub_ptr& operator = (const device_sub_ptr&);
|
||||||
|
|
||||||
|
device_ptr operator*() const
|
||||||
|
{
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
protected:
|
||||||
|
Device *device;
|
||||||
|
device_ptr ptr;
|
||||||
|
};
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
#endif /* __DEVICE_MEMORY_H__ */
|
#endif /* __DEVICE_MEMORY_H__ */
|
||||||
|
@@ -299,6 +299,60 @@ public:
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Make the buffers of all 9 neighbor tiles available on sub_device.
 *
 * Tiles without buffers and tiles that already live on sub_device are left
 * alone. For every other tile, tiles[i].buffer is replaced by a temporary
 * copy on sub_device, which unmap_neighbor_tiles() frees again. */
void map_neighbor_tiles(Device *sub_device, RenderTile *tiles)
{
	for(int tile_index = 0; tile_index < 9; ++tile_index) {
		RenderTile &tile = tiles[tile_index];

		/* Nothing to do for missing tiles and for tiles rendered on this device. */
		if(!tile.buffers || tile.buffers->device == sub_device) {
			continue;
		}

		/* The tile was rendered on another device: copy its memory to the
		 * current device now, for the duration of the denoising task.
		 * Note that this temporarily modifies the RenderBuffers and calls
		 * the device, so this function is not thread safe. */
		device_vector<float> &mem = tile.buffers->buffer;

		/* Fetch the current pixels from the device that owns the tile. */
		tile.buffers->copy_from_device();

		/* Allocate and upload through the same device_vector, then put the
		 * original device pointer back. */
		const device_ptr original_ptr = mem.device_pointer;
		mem.device_pointer = 0;
		sub_device->mem_alloc("Temporary memory for neighboring tile", mem, MEM_READ_WRITE);
		sub_device->mem_copy_to(mem);
		tile.buffer = mem.device_pointer;
		mem.device_pointer = original_ptr;
	}
}
|
||||||
|
|
||||||
|
/* Undo map_neighbor_tiles(): free the temporary copies on sub_device.
 *
 * For the center tile (index 4) the denoised result is additionally copied
 * back to the host and from there to the device that owns the tile. */
void unmap_neighbor_tiles(Device * sub_device, RenderTile * tiles)
{
	for(int tile_index = 0; tile_index < 9; ++tile_index) {
		RenderTile &tile = tiles[tile_index];

		/* Only tiles from other devices were given a temporary copy. */
		if(!tile.buffers || tile.buffers->device == sub_device) {
			continue;
		}

		const bool is_center_tile = (tile_index == 4);
		device_vector<float> &mem = tile.buffers->buffer;

		/* Temporarily point the device_vector at the copy on sub_device. */
		const device_ptr original_ptr = mem.device_pointer;
		mem.device_pointer = tile.buffer;

		/* Copy denoised tile to the host. */
		if(is_center_tile) {
			tile.buffers->copy_from_device(sub_device);
		}

		/* Free the temporary copy, then restore pointer and size as they
		 * were for the original allocation. */
		const size_t mem_size = mem.device_size;
		sub_device->mem_free(mem);
		mem.device_pointer = original_ptr;
		mem.device_size = mem_size;

		/* Copy denoised tile to the original device. */
		if(is_center_tile) {
			tile.buffers->device->mem_copy_to(mem);
		}
	}
}
|
||||||
|
|
||||||
int get_split_task_count(DeviceTask& task)
|
int get_split_task_count(DeviceTask& task)
|
||||||
{
|
{
|
||||||
int total_tasks = 0;
|
int total_tasks = 0;
|
||||||
|
@@ -166,13 +166,13 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
|
|||||||
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
|
unsigned int max_work_groups = num_global_elements / work_pool_size + 1;
|
||||||
|
|
||||||
/* Allocate work_pool_wgs memory. */
|
/* Allocate work_pool_wgs memory. */
|
||||||
work_pool_wgs.resize(max_work_groups * sizeof(unsigned int));
|
work_pool_wgs.resize(max_work_groups);
|
||||||
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
|
device->mem_alloc("work_pool_wgs", work_pool_wgs, MEM_READ_WRITE);
|
||||||
|
|
||||||
queue_index.resize(NUM_QUEUES * sizeof(int));
|
queue_index.resize(NUM_QUEUES);
|
||||||
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
|
device->mem_alloc("queue_index", queue_index, MEM_READ_WRITE);
|
||||||
|
|
||||||
use_queues_flag.resize(sizeof(char));
|
use_queues_flag.resize(1);
|
||||||
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
|
device->mem_alloc("use_queues_flag", use_queues_flag, MEM_READ_WRITE);
|
||||||
|
|
||||||
ray_state.resize(num_global_elements);
|
ray_state.resize(num_global_elements);
|
||||||
|
@@ -80,16 +80,16 @@ private:
|
|||||||
*/
|
*/
|
||||||
device_memory split_data;
|
device_memory split_data;
|
||||||
device_vector<uchar> ray_state;
|
device_vector<uchar> ray_state;
|
||||||
device_memory queue_index; /* Array of size num_queues * sizeof(int) that tracks the size of each queue. */
|
device_only_memory<int> queue_index; /* Array of size num_queues that tracks the size of each queue. */
|
||||||
|
|
||||||
/* Flag to make sceneintersect and lampemission kernel use queues. */
|
/* Flag to make sceneintersect and lampemission kernel use queues. */
|
||||||
device_memory use_queues_flag;
|
device_only_memory<char> use_queues_flag;
|
||||||
|
|
||||||
/* Approximate time it takes to complete one sample */
|
/* Approximate time it takes to complete one sample */
|
||||||
double avg_time_per_sample;
|
double avg_time_per_sample;
|
||||||
|
|
||||||
/* Work pool with respect to each work group. */
|
/* Work pool with respect to each work group. */
|
||||||
device_memory work_pool_wgs;
|
device_only_memory<unsigned int> work_pool_wgs;
|
||||||
|
|
||||||
/* clos_max value for which the kernels have been loaded currently. */
|
/* clos_max value for which the kernels have been loaded currently. */
|
||||||
int current_max_closure;
|
int current_max_closure;
|
||||||
|
@@ -56,7 +56,7 @@ int DeviceTask::get_subtask_count(int num, int max_size)
|
|||||||
if(type == SHADER) {
|
if(type == SHADER) {
|
||||||
num = min(shader_w, num);
|
num = min(shader_w, num);
|
||||||
}
|
}
|
||||||
else if(type == PATH_TRACE) {
|
else if(type == RENDER) {
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
num = min(h, num);
|
num = min(h, num);
|
||||||
@@ -82,7 +82,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
|
|||||||
tasks.push_back(task);
|
tasks.push_back(task);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if(type == PATH_TRACE) {
|
else if(type == RENDER) {
|
||||||
for(int i = 0; i < num; i++)
|
for(int i = 0; i < num; i++)
|
||||||
tasks.push_back(*this);
|
tasks.push_back(*this);
|
||||||
}
|
}
|
||||||
@@ -103,7 +103,7 @@ void DeviceTask::split(list<DeviceTask>& tasks, int num, int max_size)
|
|||||||
|
|
||||||
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
|
void DeviceTask::update_progress(RenderTile *rtile, int pixel_samples)
|
||||||
{
|
{
|
||||||
if((type != PATH_TRACE) &&
|
if((type != RENDER) &&
|
||||||
(type != SHADER))
|
(type != SHADER))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
@@ -34,7 +34,7 @@ class Tile;
|
|||||||
|
|
||||||
class DeviceTask : public Task {
|
class DeviceTask : public Task {
|
||||||
public:
|
public:
|
||||||
typedef enum { PATH_TRACE, FILM_CONVERT, SHADER } Type;
|
typedef enum { RENDER, FILM_CONVERT, SHADER } Type;
|
||||||
Type type;
|
Type type;
|
||||||
|
|
||||||
int x, y, w, h;
|
int x, y, w, h;
|
||||||
@@ -53,7 +53,7 @@ public:
|
|||||||
|
|
||||||
int passes_size;
|
int passes_size;
|
||||||
|
|
||||||
explicit DeviceTask(Type type = PATH_TRACE);
|
explicit DeviceTask(Type type = RENDER);
|
||||||
|
|
||||||
int get_subtask_count(int num, int max_size = 0);
|
int get_subtask_count(int num, int max_size = 0);
|
||||||
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
|
void split(list<DeviceTask>& tasks, int num, int max_size = 0);
|
||||||
@@ -65,6 +65,16 @@ public:
|
|||||||
function<void(RenderTile&)> update_tile_sample;
|
function<void(RenderTile&)> update_tile_sample;
|
||||||
function<void(RenderTile&)> release_tile;
|
function<void(RenderTile&)> release_tile;
|
||||||
function<bool(void)> get_cancel;
|
function<bool(void)> get_cancel;
|
||||||
|
function<void(RenderTile*, Device*)> map_neighbor_tiles;
|
||||||
|
function<void(RenderTile*, Device*)> unmap_neighbor_tiles;
|
||||||
|
|
||||||
|
int denoising_radius;
|
||||||
|
float denoising_strength;
|
||||||
|
float denoising_feature_strength;
|
||||||
|
bool denoising_relative_pca;
|
||||||
|
int pass_stride;
|
||||||
|
int pass_denoising_data;
|
||||||
|
int pass_denoising_clean;
|
||||||
|
|
||||||
bool need_finish_queue;
|
bool need_finish_queue;
|
||||||
bool integrator_branched;
|
bool integrator_branched;
|
||||||
|
@@ -17,6 +17,7 @@
|
|||||||
#ifdef WITH_OPENCL
|
#ifdef WITH_OPENCL
|
||||||
|
|
||||||
#include "device/device.h"
|
#include "device/device.h"
|
||||||
|
#include "device/device_denoising.h"
|
||||||
|
|
||||||
#include "util/util_map.h"
|
#include "util/util_map.h"
|
||||||
#include "util/util_param.h"
|
#include "util/util_param.h"
|
||||||
@@ -129,6 +130,8 @@ public:
|
|||||||
cl_int* error = NULL);
|
cl_int* error = NULL);
|
||||||
static cl_device_type get_device_type(cl_device_id device_id);
|
static cl_device_type get_device_type(cl_device_id device_id);
|
||||||
|
|
||||||
|
static int mem_address_alignment(cl_device_id device_id);
|
||||||
|
|
||||||
/* Get somewhat more readable device name.
|
/* Get somewhat more readable device name.
|
||||||
* Main difference is AMD OpenCL here which only gives code name
|
* Main difference is AMD OpenCL here which only gives code name
|
||||||
* for the regular device name. This will give more sane device
|
* for the regular device name. This will give more sane device
|
||||||
@@ -218,7 +221,7 @@ public:
|
|||||||
cl_int err = stmt; \
|
cl_int err = stmt; \
|
||||||
\
|
\
|
||||||
if(err != CL_SUCCESS) { \
|
if(err != CL_SUCCESS) { \
|
||||||
string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
|
string message = string_printf("OpenCL error: %s in %s (%s:%d)", clewErrorString(err), #stmt, __FILE__, __LINE__); \
|
||||||
if(error_msg == "") \
|
if(error_msg == "") \
|
||||||
error_msg = message; \
|
error_msg = message; \
|
||||||
fprintf(stderr, "%s\n", message.c_str()); \
|
fprintf(stderr, "%s\n", message.c_str()); \
|
||||||
@@ -282,7 +285,7 @@ public:
|
|||||||
map<ustring, cl_kernel> kernels;
|
map<ustring, cl_kernel> kernels;
|
||||||
};
|
};
|
||||||
|
|
||||||
OpenCLProgram base_program;
|
OpenCLProgram base_program, denoising_program;
|
||||||
|
|
||||||
typedef map<string, device_vector<uchar>*> ConstMemMap;
|
typedef map<string, device_vector<uchar>*> ConstMemMap;
|
||||||
typedef map<string, device_ptr> MemMap;
|
typedef map<string, device_ptr> MemMap;
|
||||||
@@ -320,6 +323,9 @@ public:
|
|||||||
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
|
void mem_copy_from(device_memory& mem, int y, int w, int h, int elem);
|
||||||
void mem_zero(device_memory& mem);
|
void mem_zero(device_memory& mem);
|
||||||
void mem_free(device_memory& mem);
|
void mem_free(device_memory& mem);
|
||||||
|
|
||||||
|
int mem_address_alignment();
|
||||||
|
|
||||||
void const_copy_to(const char *name, void *host, size_t size);
|
void const_copy_to(const char *name, void *host, size_t size);
|
||||||
void tex_alloc(const char *name,
|
void tex_alloc(const char *name,
|
||||||
device_memory& mem,
|
device_memory& mem,
|
||||||
@@ -328,12 +334,14 @@ public:
|
|||||||
void tex_free(device_memory& mem);
|
void tex_free(device_memory& mem);
|
||||||
|
|
||||||
size_t global_size_round_up(int group_size, int global_size);
|
size_t global_size_round_up(int group_size, int global_size);
|
||||||
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h);
|
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
|
||||||
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
|
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
|
||||||
|
|
||||||
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
|
void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half);
|
||||||
void shader(DeviceTask& task);
|
void shader(DeviceTask& task);
|
||||||
|
|
||||||
|
void denoise(RenderTile& tile, const DeviceTask& task);
|
||||||
|
|
||||||
class OpenCLDeviceTask : public DeviceTask {
|
class OpenCLDeviceTask : public DeviceTask {
|
||||||
public:
|
public:
|
||||||
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
|
OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
|
||||||
@@ -367,9 +375,48 @@ public:
|
|||||||
|
|
||||||
virtual void thread_run(DeviceTask * /*task*/) = 0;
|
virtual void thread_run(DeviceTask * /*task*/) = 0;
|
||||||
|
|
||||||
|
virtual bool is_split_kernel() = 0;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
string kernel_build_options(const string *debug_src = NULL);
|
string kernel_build_options(const string *debug_src = NULL);
|
||||||
|
|
||||||
|
void mem_zero_kernel(device_ptr ptr, size_t size);
|
||||||
|
|
||||||
|
bool denoising_non_local_means(device_ptr image_ptr,
|
||||||
|
device_ptr guide_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
device_ptr out_ptr,
|
||||||
|
DenoisingTask *task);
|
||||||
|
bool denoising_construct_transform(DenoisingTask *task);
|
||||||
|
bool denoising_reconstruct(device_ptr color_ptr,
|
||||||
|
device_ptr color_variance_ptr,
|
||||||
|
device_ptr guide_ptr,
|
||||||
|
device_ptr guide_variance_ptr,
|
||||||
|
device_ptr output_ptr,
|
||||||
|
DenoisingTask *task);
|
||||||
|
bool denoising_combine_halves(device_ptr a_ptr,
|
||||||
|
device_ptr b_ptr,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
int r, int4 rect,
|
||||||
|
DenoisingTask *task);
|
||||||
|
bool denoising_divide_shadow(device_ptr a_ptr,
|
||||||
|
device_ptr b_ptr,
|
||||||
|
device_ptr sample_variance_ptr,
|
||||||
|
device_ptr sv_variance_ptr,
|
||||||
|
device_ptr buffer_variance_ptr,
|
||||||
|
DenoisingTask *task);
|
||||||
|
bool denoising_get_feature(int mean_offset,
|
||||||
|
int variance_offset,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
DenoisingTask *task);
|
||||||
|
bool denoising_set_tiles(device_ptr *buffers,
|
||||||
|
DenoisingTask *task);
|
||||||
|
|
||||||
|
device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type);
|
||||||
|
void mem_free_sub_ptr(device_ptr ptr);
|
||||||
|
|
||||||
class ArgumentWrapper {
|
class ArgumentWrapper {
|
||||||
public:
|
public:
|
||||||
ArgumentWrapper() : size(0), pointer(NULL)
|
ArgumentWrapper() : size(0), pointer(NULL)
|
||||||
|
@@ -213,8 +213,23 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
|
|||||||
base_program.add_kernel(ustring("bake"));
|
base_program.add_kernel(ustring("bake"));
|
||||||
base_program.add_kernel(ustring("zero_buffer"));
|
base_program.add_kernel(ustring("zero_buffer"));
|
||||||
|
|
||||||
|
denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
|
||||||
|
denoising_program.add_kernel(ustring("filter_divide_shadow"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_get_feature"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_combine_halves"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_construct_transform"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_nlm_blur"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_nlm_update_output"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_nlm_normalize"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_finalize"));
|
||||||
|
denoising_program.add_kernel(ustring("filter_set_tiles"));
|
||||||
|
|
||||||
vector<OpenCLProgram*> programs;
|
vector<OpenCLProgram*> programs;
|
||||||
programs.push_back(&base_program);
|
programs.push_back(&base_program);
|
||||||
|
programs.push_back(&denoising_program);
|
||||||
/* Call actual class to fill the vector with its programs. */
|
/* Call actual class to fill the vector with its programs. */
|
||||||
if(!load_kernels(requested_features, programs)) {
|
if(!load_kernels(requested_features, programs)) {
|
||||||
return false;
|
return false;
|
||||||
@@ -322,21 +337,19 @@ void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, in
|
|||||||
NULL, NULL));
|
NULL, NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
void OpenCLDeviceBase::mem_zero(device_memory& mem)
|
void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
|
||||||
{
|
{
|
||||||
if(mem.device_pointer) {
|
|
||||||
if(base_program.is_loaded()) {
|
|
||||||
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
|
cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
|
||||||
|
|
||||||
size_t global_size[] = {1024, 1024};
|
size_t global_size[] = {1024, 1024};
|
||||||
size_t num_threads = global_size[0] * global_size[1];
|
size_t num_threads = global_size[0] * global_size[1];
|
||||||
|
|
||||||
cl_mem d_buffer = CL_MEM_PTR(mem.device_pointer);
|
cl_mem d_buffer = CL_MEM_PTR(mem);
|
||||||
cl_ulong d_offset = 0;
|
cl_ulong d_offset = 0;
|
||||||
cl_ulong d_size = 0;
|
cl_ulong d_size = 0;
|
||||||
|
|
||||||
while(d_offset < mem.memory_size()) {
|
while(d_offset < size) {
|
||||||
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), mem.memory_size() - d_offset);
|
d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
|
||||||
|
|
||||||
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
|
kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
|
||||||
|
|
||||||
@@ -353,6 +366,13 @@ void OpenCLDeviceBase::mem_zero(device_memory& mem)
|
|||||||
|
|
||||||
d_offset += d_size;
|
d_offset += d_size;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void OpenCLDeviceBase::mem_zero(device_memory& mem)
|
||||||
|
{
|
||||||
|
if(mem.device_pointer) {
|
||||||
|
if(base_program.is_loaded()) {
|
||||||
|
mem_zero_kernel(mem.device_pointer, mem.memory_size());
|
||||||
}
|
}
|
||||||
|
|
||||||
if(mem.data_pointer) {
|
if(mem.data_pointer) {
|
||||||
@@ -396,6 +416,41 @@ void OpenCLDeviceBase::mem_free(device_memory& mem)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int OpenCLDeviceBase::mem_address_alignment()
|
||||||
|
{
|
||||||
|
return OpenCLInfo::mem_address_alignment(cdDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create an OpenCL sub-buffer that aliases a region of an existing allocation.
 *
 * mem     Parent allocation; its device_pointer is used as the parent buffer.
 * offset  Start of the region, in elements (converted to bytes through
 *         mem.memory_elements_size()).
 * size    Length of the region, in elements (converted the same way).
 * type    Requested access; anything other than MEM_READ_ONLY or
 *         MEM_WRITE_ONLY is mapped to read/write.
 *
 * Returns the sub-buffer handle as a device_ptr. The result of
 * clCreateSubBuffer is checked with opencl_assert_err(). */
device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size, MemoryType type)
{
	/* Translate the Cycles memory type into OpenCL access flags. */
	cl_mem_flags access_flags = CL_MEM_READ_WRITE;
	if(type == MEM_READ_ONLY) {
		access_flags = CL_MEM_READ_ONLY;
	}
	else if(type == MEM_WRITE_ONLY) {
		access_flags = CL_MEM_WRITE_ONLY;
	}

	/* Region of the parent buffer covered by the sub-buffer. */
	cl_buffer_region region;
	region.origin = mem.memory_elements_size(offset);
	region.size = mem.memory_elements_size(size);

	cl_mem parent_buffer = CL_MEM_PTR(mem.device_pointer);
	cl_mem sub_buffer = clCreateSubBuffer(parent_buffer,
	                                      access_flags,
	                                      CL_BUFFER_CREATE_TYPE_REGION,
	                                      &region,
	                                      &ciErr);
	device_ptr sub_ptr = (device_ptr) sub_buffer;
	opencl_assert_err(ciErr, "clCreateSubBuffer");

	return sub_ptr;
}
|
||||||
|
|
||||||
|
/* Release a sub-buffer handle.
 *
 * A zero pointer and the shared null_mem placeholder are ignored; any other
 * handle is released with clReleaseMemObject(), checked by opencl_assert(). */
void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
{
	/* Nothing to release for an empty handle or the null placeholder. */
	if(!device_pointer || device_pointer == null_mem) {
		return;
	}

	opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
}
|
||||||
|
|
||||||
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
|
void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
|
||||||
{
|
{
|
||||||
ConstMemMap::iterator i = const_mem_map.find(name);
|
ConstMemMap::iterator i = const_mem_map.find(name);
|
||||||
@@ -449,7 +504,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
|
|||||||
return global_size + ((r == 0)? 0: group_size - r);
|
return global_size + ((r == 0)? 0: group_size - r);
|
||||||
}
|
}
|
||||||
|
|
||||||
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
|
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
|
||||||
{
|
{
|
||||||
size_t workgroup_size, max_work_items[3];
|
size_t workgroup_size, max_work_items[3];
|
||||||
|
|
||||||
@@ -458,6 +513,10 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
|
|||||||
clGetDeviceInfo(cdDevice,
|
clGetDeviceInfo(cdDevice,
|
||||||
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
|
CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
|
||||||
|
|
||||||
|
if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
|
||||||
|
workgroup_size = max_workgroup_size;
|
||||||
|
}
|
||||||
|
|
||||||
/* Try to divide evenly over 2 dimensions. */
|
/* Try to divide evenly over 2 dimensions. */
|
||||||
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
|
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
|
||||||
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
|
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
|
||||||
@@ -543,6 +602,362 @@ set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
|
|||||||
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
|
enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Enqueue the non-local means filter kernels for the rectangle task->rect.
 *
 * image_ptr     Buffer handed to the update_output kernel as its input image.
 * guide_ptr     Buffer handed to the calc_difference kernel as guide.
 * variance_ptr  Buffer handed to the calc_difference kernel as variance.
 * out_ptr       Output buffer; zeroed first, then accumulated into and
 *               finally passed to the normalize kernel.
 * task          Provides the rectangle, the filter parameters (nlm_state.r,
 *               .f, .a, .k_2) and three temporary buffers.
 *
 * For every offset (dx, dy) in the (2r+1)x(2r+1) window the kernels
 * calc_difference, blur, calc_weight, blur and update_output are enqueued in
 * that order. Always returns true. */
bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
                                                 device_ptr guide_ptr,
                                                 device_ptr variance_ptr,
                                                 device_ptr out_ptr,
                                                 DenoisingTask *task)
{
	/* Extent of the filtered rectangle. */
	int4 rect = task->rect;
	int width = rect.z - rect.x;
	int height = rect.w - rect.y;

	/* Filter parameters. */
	int nlm_r = task->nlm_state.r;
	int nlm_f = task->nlm_state.f;
	float nlm_a = task->nlm_state.a;
	float nlm_k_2 = task->nlm_state.k_2;

	/* Caller supplied buffers. */
	cl_mem image_mem = CL_MEM_PTR(image_ptr);
	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
	cl_mem out_mem = CL_MEM_PTR(out_ptr);

	/* Scratch buffers owned by the task. */
	cl_mem diff_mem = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
	cl_mem blur_diff_mem = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
	cl_mem weight_accum_mem = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);

	/* Both accumulation targets start out cleared. */
	size_t plane_bytes = sizeof(float)*width*height;
	mem_zero_kernel(task->nlm_state.temporary_3_ptr, plane_bytes);
	mem_zero_kernel(out_ptr, plane_bytes);

	cl_kernel calc_difference_kernel = denoising_program(ustring("filter_nlm_calc_difference"));
	cl_kernel blur_kernel = denoising_program(ustring("filter_nlm_blur"));
	cl_kernel calc_weight_kernel = denoising_program(ustring("filter_nlm_calc_weight"));
	cl_kernel update_output_kernel = denoising_program(ustring("filter_nlm_update_output"));
	cl_kernel normalize_kernel = denoising_program(ustring("filter_nlm_normalize"));

	/* Walk the search window row by row. */
	const int window = 2*nlm_r + 1;
	const int num_offsets = window*window;
	for(int offset = 0; offset < num_offsets; offset++) {
		int dy = offset / window - nlm_r;
		int dx = offset % window - nlm_r;

		/* Rectangle clipped by the current offset. */
		int4 clipped_rect = make_int4(max(0, -dx),
		                              max(0, -dy),
		                              width - max(0, dx),
		                              height - max(0, dy));

		kernel_set_args(calc_difference_kernel, 0,
		                dx, dy, guide_mem, variance_mem,
		                diff_mem, clipped_rect, width, 0, nlm_a, nlm_k_2);
		kernel_set_args(blur_kernel, 0,
		                diff_mem, blur_diff_mem, clipped_rect, width, nlm_f);
		kernel_set_args(calc_weight_kernel, 0,
		                blur_diff_mem, diff_mem, clipped_rect, width, nlm_f);
		kernel_set_args(update_output_kernel, 0,
		                dx, dy, blur_diff_mem, image_mem,
		                out_mem, weight_accum_mem, clipped_rect, width, nlm_f);

		/* The blur kernel runs twice with the same arguments. */
		enqueue_kernel(calc_difference_kernel, width, height);
		enqueue_kernel(blur_kernel, width, height);
		enqueue_kernel(calc_weight_kernel, width, height);
		enqueue_kernel(blur_kernel, width, height);
		enqueue_kernel(update_output_kernel, width, height);
	}

	/* Final pass over the whole rectangle with the accumulated weights. */
	int4 full_rect = make_int4(0, 0, width, height);
	kernel_set_args(normalize_kernel, 0,
	                out_mem, weight_accum_mem, full_rect, width);
	enqueue_kernel(normalize_kernel, width, height);

	return true;
}
|
||||||
|
|
||||||
|
/* Enqueue the filter_construct_transform kernel for a denoising task.
 *
 * The kernel receives the task's buffer, the transform and rank storage,
 * the filter area, the rectangle, the pass stride, the radius and the PCA
 * threshold. It is launched over storage.w x storage.h work items with the
 * work-group size capped at 256. Always returns true. */
bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
{
	cl_kernel transform_kernel = denoising_program(ustring("filter_construct_transform"));

	cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
	cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);

	kernel_set_args(transform_kernel, 0,
	                buffer_mem, transform_mem, rank_mem,
	                task->filter_area, task->rect,
	                task->buffer.pass_stride,
	                task->radius,
	                task->pca_threshold);

	/* Cap the work-group size at 256 for this kernel. */
	enqueue_kernel(transform_kernel, task->storage.w, task->storage.h, 256);

	return true;
}
|
||||||
|
|
||||||
|
/* Enqueue the reconstruction kernels of a denoising task.
 *
 * color_ptr / color_variance_ptr  Passed to the construct_gramian kernel.
 * guide_ptr / guide_variance_ptr  Passed to the calc_difference kernel.
 * output_ptr                      Passed to the finalize kernel.
 * task                            Provides buffers, storage (transform,
 *                                 rank, XtWX, XtWY), temporaries and sizes.
 *
 * XtWX and XtWY are zeroed first. Then, for every offset (dx, dy) in the
 * (2r+1)x(2r+1) window with r = task->radius, the kernels calc_difference,
 * blur, calc_weight, blur and construct_gramian are enqueued in that order.
 * A single finalize kernel follows the loop. Always returns true. */
bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
                                             device_ptr color_variance_ptr,
                                             device_ptr guide_ptr,
                                             device_ptr guide_variance_ptr,
                                             device_ptr output_ptr,
                                             DenoisingTask *task)
{
	/* The accumulation buffers must start out cleared. */
	mem_zero(task->storage.XtWX);
	mem_zero(task->storage.XtWY);

	/* Caller supplied buffers. */
	cl_mem color_mem = CL_MEM_PTR(color_ptr);
	cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
	cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
	cl_mem guide_variance_mem = CL_MEM_PTR(guide_variance_ptr);
	cl_mem output_mem = CL_MEM_PTR(output_ptr);

	/* Buffers owned by the task. */
	cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
	cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
	cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
	cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
	cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);

	cl_kernel calc_difference_kernel = denoising_program(ustring("filter_nlm_calc_difference"));
	cl_kernel blur_kernel = denoising_program(ustring("filter_nlm_blur"));
	cl_kernel calc_weight_kernel = denoising_program(ustring("filter_nlm_calc_weight"));
	cl_kernel construct_gramian_kernel = denoising_program(ustring("filter_nlm_construct_gramian"));
	cl_kernel finalize_kernel = denoising_program(ustring("filter_finalize"));

	/* Scratch buffers. */
	cl_mem diff_mem = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
	cl_mem blur_diff_mem = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);

	/* Window radius comes from the task; f and a are fixed here. */
	int radius = task->radius;
	int nlm_f = 4;
	float nlm_a = 1.0f;

	const int window = 2*radius + 1;
	const int num_offsets = window*window;
	for(int offset = 0; offset < num_offsets; offset++) {
		int dy = offset / window - radius;
		int dx = offset % window - radius;

		/* Source rectangle clipped by the current offset. */
		int clipped_rect[4] = {max(0, -dx),
		                       max(0, -dy),
		                       task->reconstruction_state.source_w - max(0, dx),
		                       task->reconstruction_state.source_h - max(0, dy)};

		kernel_set_args(calc_difference_kernel, 0,
		                dx, dy,
		                guide_mem, guide_variance_mem,
		                diff_mem,
		                clipped_rect,
		                task->buffer.w,
		                task->buffer.pass_stride,
		                nlm_a, task->nlm_k_2);
		enqueue_kernel(calc_difference_kernel,
		               task->reconstruction_state.source_w,
		               task->reconstruction_state.source_h);

		kernel_set_args(blur_kernel, 0,
		                diff_mem, blur_diff_mem,
		                clipped_rect,
		                task->buffer.w,
		                nlm_f);
		enqueue_kernel(blur_kernel,
		               task->reconstruction_state.source_w,
		               task->reconstruction_state.source_h);

		kernel_set_args(calc_weight_kernel, 0,
		                blur_diff_mem, diff_mem,
		                clipped_rect,
		                task->buffer.w,
		                nlm_f);
		enqueue_kernel(calc_weight_kernel,
		               task->reconstruction_state.source_w,
		               task->reconstruction_state.source_h);

		/* Second blur pass: the arguments set above are still in place. */
		enqueue_kernel(blur_kernel,
		               task->reconstruction_state.source_w,
		               task->reconstruction_state.source_h);

		kernel_set_args(construct_gramian_kernel, 0,
		                dx, dy,
		                blur_diff_mem,
		                buffer_mem,
		                color_mem, color_variance_mem,
		                transform_mem, rank_mem,
		                XtWX_mem, XtWY_mem,
		                clipped_rect,
		                task->reconstruction_state.filter_rect,
		                task->buffer.w, task->buffer.h,
		                nlm_f,
		                task->buffer.pass_stride);
		/* Work-group size is capped at 256 for this kernel. */
		enqueue_kernel(construct_gramian_kernel,
		               task->reconstruction_state.source_w,
		               task->reconstruction_state.source_h,
		               256);
	}

	kernel_set_args(finalize_kernel, 0,
	                task->buffer.w, task->buffer.h,
	                output_mem,
	                rank_mem,
	                XtWX_mem, XtWY_mem,
	                task->filter_area,
	                task->reconstruction_state.buffer_params,
	                task->render_buffer.samples);
	enqueue_kernel(finalize_kernel,
	               task->reconstruction_state.source_w,
	               task->reconstruction_state.source_h);

	return true;
}
|
||||||
|
|
||||||
|
/* Enqueue the filter_combine_halves kernel.
 *
 * a_ptr, b_ptr  The two input buffers.
 * mean_ptr      Buffer passed to the kernel as the mean output.
 * variance_ptr  Buffer passed to the kernel as the variance output.
 * r, rect       Forwarded to the kernel unchanged.
 * task          Its rect determines the launch size (width x height of
 *               task->rect), independently of the rect argument.
 *
 * Always returns true. */
bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
                                                device_ptr b_ptr,
                                                device_ptr mean_ptr,
                                                device_ptr variance_ptr,
                                                int r, int4 rect,
                                                DenoisingTask *task)
{
	cl_mem a_mem = CL_MEM_PTR(a_ptr);
	cl_mem b_mem = CL_MEM_PTR(b_ptr);
	cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
	cl_mem variance_mem = CL_MEM_PTR(variance_ptr);

	cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));

	/* Note the kernel takes the outputs first, then the inputs. */
	kernel_set_args(ckFilterCombineHalves, 0,
	                mean_mem,
	                variance_mem,
	                a_mem,
	                b_mem,
	                rect,
	                r);
	enqueue_kernel(ckFilterCombineHalves,
	               task->rect.z-task->rect.x,
	               task->rect.w-task->rect.y);

	return true;
}
|
||||||
|
|
||||||
|
/* Enqueue the filter_divide_shadow kernel.
 *
 * a_ptr, b_ptr, sample_variance_ptr, sv_variance_ptr and buffer_variance_ptr
 * are forwarded to the kernel as buffers, in that order. The task supplies
 * the sample count, the tile info buffer, the rectangle, the pass stride and
 * the denoising data offset. The last kernel argument tells the kernel
 * whether this device runs the split kernel. The launch covers the width and
 * height of task->rect. Always returns true. */
bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
                                               device_ptr b_ptr,
                                               device_ptr sample_variance_ptr,
                                               device_ptr sv_variance_ptr,
                                               device_ptr buffer_variance_ptr,
                                               DenoisingTask *task)
{
	cl_mem a_mem = CL_MEM_PTR(a_ptr);
	cl_mem b_mem = CL_MEM_PTR(b_ptr);
	cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
	cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
	cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);

	cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);

	cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));

	/* Passed as a one byte flag; the type determines the kernel argument size. */
	char split_kernel = is_split_kernel()? 1 : 0;
	kernel_set_args(ckFilterDivideShadow, 0,
	                task->render_buffer.samples,
	                tiles_mem,
	                a_mem,
	                b_mem,
	                sample_variance_mem,
	                sv_variance_mem,
	                buffer_variance_mem,
	                task->rect,
	                task->render_buffer.pass_stride,
	                task->render_buffer.denoising_data_offset,
	                split_kernel);
	enqueue_kernel(ckFilterDivideShadow,
	               task->rect.z-task->rect.x,
	               task->rect.w-task->rect.y);

	return true;
}
|
||||||
|
|
||||||
|
bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
|
||||||
|
int variance_offset,
|
||||||
|
device_ptr mean_ptr,
|
||||||
|
device_ptr variance_ptr,
|
||||||
|
DenoisingTask *task)
|
||||||
|
{
|
||||||
|
cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
|
||||||
|
cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
|
||||||
|
|
||||||
|
cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
|
||||||
|
|
||||||
|
cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
|
||||||
|
|
||||||
|
char split_kernel = is_split_kernel()? 1 : 0;
|
||||||
|
kernel_set_args(ckFilterGetFeature, 0,
|
||||||
|
task->render_buffer.samples,
|
||||||
|
tiles_mem,
|
||||||
|
mean_offset,
|
||||||
|
variance_offset,
|
||||||
|
mean_mem,
|
||||||
|
variance_mem,
|
||||||
|
task->rect,
|
||||||
|
task->render_buffer.pass_stride,
|
||||||
|
task->render_buffer.denoising_data_offset,
|
||||||
|
split_kernel);
|
||||||
|
enqueue_kernel(ckFilterGetFeature,
|
||||||
|
task->rect.z-task->rect.x,
|
||||||
|
task->rect.w-task->rect.y);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Allocate and upload the task's tile info, then run filter_set_tiles.
 *
 * buffers  Array of 9 buffer handles; they become kernel arguments 1..9.
 * task     Owns tiles_mem, which is allocated, copied to the device and
 *          passed as kernel argument 0.
 *
 * The kernel is launched with a single work item. Always returns true. */
bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
                                           DenoisingTask *task)
{
	mem_alloc("Denoising Tile Info", task->tiles_mem, MEM_READ_WRITE);
	mem_copy_to(task->tiles_mem);

	cl_kernel set_tiles_kernel = denoising_program(ustring("filter_set_tiles"));

	/* Argument 0 is the tile info, arguments 1..9 are the tile buffers. */
	cl_mem tile_info_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
	kernel_set_args(set_tiles_kernel, 0, tile_info_mem);
	for(int arg_index = 1; arg_index <= 9; arg_index++) {
		cl_mem tile_buffer_mem = CL_MEM_PTR(buffers[arg_index - 1]);
		kernel_set_args(set_tiles_kernel, arg_index, tile_buffer_mem);
	}

	enqueue_kernel(set_tiles_kernel, 1, 1);

	return true;
}
|
||||||
|
|
||||||
|
/* Denoise one render tile on this device.
 *
 * A DenoisingTask is created and wired to the OpenCL implementations of its
 * callbacks. The tile and its neighbors are then mapped through the device
 * task (9 tiles, with the given tile at index 4), the denoising is run, and
 * the neighbor tiles are unmapped again. */
void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
{
	DenoisingTask denoiser(this);

	/* Hook up the device specific implementation of every denoising step. */
	denoiser.functions.set_tiles =
	        function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoiser);
	denoiser.functions.construct_transform =
	        function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoiser);
	denoiser.functions.reconstruct =
	        function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, _4, _5, &denoiser);
	denoiser.functions.divide_shadow =
	        function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoiser);
	denoiser.functions.non_local_means =
	        function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoiser);
	denoiser.functions.combine_halves =
	        function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoiser);
	denoiser.functions.get_feature =
	        function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoiser);

	/* Area to filter and number of samples come from the tile itself. */
	denoiser.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
	denoiser.render_buffer.samples = rtile.sample;

	/* The given tile sits at index 4; the device task fills in the rest. */
	RenderTile neighbor_tiles[9];
	neighbor_tiles[4] = rtile;
	task.map_neighbor_tiles(neighbor_tiles, this);
	denoiser.tiles_from_rendertiles(neighbor_tiles);

	denoiser.init_from_devicetask(task);

	denoiser.run_denoising();

	task.unmap_neighbor_tiles(neighbor_tiles, this);
}
|
||||||
|
|
||||||
void OpenCLDeviceBase::shader(DeviceTask& task)
|
void OpenCLDeviceBase::shader(DeviceTask& task)
|
||||||
{
|
{
|
||||||
/* cast arguments to cl types */
|
/* cast arguments to cl types */
|
||||||
|
@@ -108,10 +108,11 @@ public:
|
|||||||
else if(task->type == DeviceTask::SHADER) {
|
else if(task->type == DeviceTask::SHADER) {
|
||||||
shader(*task);
|
shader(*task);
|
||||||
}
|
}
|
||||||
else if(task->type == DeviceTask::PATH_TRACE) {
|
else if(task->type == DeviceTask::RENDER) {
|
||||||
RenderTile tile;
|
RenderTile tile;
|
||||||
/* Keep rendering tiles until done. */
|
/* Keep rendering tiles until done. */
|
||||||
while(task->acquire_tile(this, tile)) {
|
while(task->acquire_tile(this, tile)) {
|
||||||
|
if(tile.task == RenderTile::PATH_TRACE) {
|
||||||
int start_sample = tile.start_sample;
|
int start_sample = tile.start_sample;
|
||||||
int end_sample = tile.start_sample + tile.num_samples;
|
int end_sample = tile.start_sample + tile.num_samples;
|
||||||
|
|
||||||
@@ -138,11 +139,22 @@ public:
|
|||||||
* next tile.
|
* next tile.
|
||||||
*/
|
*/
|
||||||
clFinish(cqCommandQueue);
|
clFinish(cqCommandQueue);
|
||||||
|
}
|
||||||
|
else if(tile.task == RenderTile::DENOISE) {
|
||||||
|
tile.sample = tile.start_sample + tile.num_samples;
|
||||||
|
denoise(tile, *task);
|
||||||
|
task->update_progress(&tile, tile.w*tile.h);
|
||||||
|
}
|
||||||
|
|
||||||
task->release_tile(tile);
|
task->release_tile(tile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This device does not run the split kernel.
 * NOTE(review): appears to be the mega kernel device class - confirm. */
bool is_split_kernel()
{
	const bool uses_split_kernel = false;
	return uses_split_kernel;
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
|
Device *opencl_create_mega_device(DeviceInfo& info, Stats& stats, bool background)
|
||||||
|
@@ -104,7 +104,7 @@ public:
|
|||||||
else if(task->type == DeviceTask::SHADER) {
|
else if(task->type == DeviceTask::SHADER) {
|
||||||
shader(*task);
|
shader(*task);
|
||||||
}
|
}
|
||||||
else if(task->type == DeviceTask::PATH_TRACE) {
|
else if(task->type == DeviceTask::RENDER) {
|
||||||
RenderTile tile;
|
RenderTile tile;
|
||||||
|
|
||||||
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
|
/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
|
||||||
@@ -127,6 +127,8 @@ public:
|
|||||||
|
|
||||||
/* Keep rendering tiles until done. */
|
/* Keep rendering tiles until done. */
|
||||||
while(task->acquire_tile(this, tile)) {
|
while(task->acquire_tile(this, tile)) {
|
||||||
|
if(tile.task == RenderTile::PATH_TRACE) {
|
||||||
|
assert(tile.task == RenderTile::PATH_TRACE);
|
||||||
split_kernel->path_trace(task,
|
split_kernel->path_trace(task,
|
||||||
tile,
|
tile,
|
||||||
kgbuffer,
|
kgbuffer,
|
||||||
@@ -142,6 +144,12 @@ public:
|
|||||||
* next tile.
|
* next tile.
|
||||||
*/
|
*/
|
||||||
clFinish(cqCommandQueue);
|
clFinish(cqCommandQueue);
|
||||||
|
}
|
||||||
|
else if(tile.task == RenderTile::DENOISE) {
|
||||||
|
tile.sample = tile.start_sample + tile.num_samples;
|
||||||
|
denoise(tile, *task);
|
||||||
|
task->update_progress(&tile, tile.w*tile.h);
|
||||||
|
}
|
||||||
|
|
||||||
task->release_tile(tile);
|
task->release_tile(tile);
|
||||||
}
|
}
|
||||||
@@ -150,6 +158,11 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* This device runs the split kernel. */
bool is_split_kernel()
{
	const bool uses_split_kernel = true;
	return uses_split_kernel;
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
/* ** Those guys are for working around some compiler-specific bugs ** */
|
/* ** Those guys are for working around some compiler-specific bugs ** */
|
||||||
|
|
||||||
|
@@ -1073,6 +1073,20 @@ string OpenCLInfo::get_readable_device_name(cl_device_id device_id)
|
|||||||
return get_device_name(device_id);
|
return get_device_name(device_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Query the base address alignment of device memory for device_id.
 *
 * OpenCL reports CL_DEVICE_MEM_BASE_ADDR_ALIGN as a cl_uint measured in
 * bits, so the query uses a cl_uint and the value is converted to bytes
 * before returning. If the query fails, 1 is returned. */
int OpenCLInfo::mem_address_alignment(cl_device_id device_id)
{
	cl_uint base_align_bits = 0;
	if(clGetDeviceInfo(device_id,
	                   CL_DEVICE_MEM_BASE_ADDR_ALIGN,
	                   sizeof(base_align_bits),
	                   &base_align_bits,
	                   NULL) == CL_SUCCESS)
	{
		/* Bits to bytes. */
		return (int)(base_align_bits/8);
	}
	return 1;
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -10,7 +10,23 @@ set(INC_SYS
|
|||||||
|
|
||||||
set(SRC
|
set(SRC
|
||||||
kernels/cpu/kernel.cpp
|
kernels/cpu/kernel.cpp
|
||||||
|
kernels/cpu/kernel_sse2.cpp
|
||||||
|
kernels/cpu/kernel_sse3.cpp
|
||||||
|
kernels/cpu/kernel_sse41.cpp
|
||||||
|
kernels/cpu/kernel_avx.cpp
|
||||||
|
kernels/cpu/kernel_avx2.cpp
|
||||||
kernels/cpu/kernel_split.cpp
|
kernels/cpu/kernel_split.cpp
|
||||||
|
kernels/cpu/kernel_split_sse2.cpp
|
||||||
|
kernels/cpu/kernel_split_sse3.cpp
|
||||||
|
kernels/cpu/kernel_split_sse41.cpp
|
||||||
|
kernels/cpu/kernel_split_avx.cpp
|
||||||
|
kernels/cpu/kernel_split_avx2.cpp
|
||||||
|
kernels/cpu/filter.cpp
|
||||||
|
kernels/cpu/filter_sse2.cpp
|
||||||
|
kernels/cpu/filter_sse3.cpp
|
||||||
|
kernels/cpu/filter_sse41.cpp
|
||||||
|
kernels/cpu/filter_avx.cpp
|
||||||
|
kernels/cpu/filter_avx2.cpp
|
||||||
kernels/opencl/kernel.cl
|
kernels/opencl/kernel.cl
|
||||||
kernels/opencl/kernel_state_buffer_size.cl
|
kernels/opencl/kernel_state_buffer_size.cl
|
||||||
kernels/opencl/kernel_split.cl
|
kernels/opencl/kernel_split.cl
|
||||||
@@ -32,8 +48,10 @@ set(SRC
|
|||||||
kernels/opencl/kernel_next_iteration_setup.cl
|
kernels/opencl/kernel_next_iteration_setup.cl
|
||||||
kernels/opencl/kernel_indirect_subsurface.cl
|
kernels/opencl/kernel_indirect_subsurface.cl
|
||||||
kernels/opencl/kernel_buffer_update.cl
|
kernels/opencl/kernel_buffer_update.cl
|
||||||
|
kernels/opencl/filter.cl
|
||||||
kernels/cuda/kernel.cu
|
kernels/cuda/kernel.cu
|
||||||
kernels/cuda/kernel_split.cu
|
kernels/cuda/kernel_split.cu
|
||||||
|
kernels/cuda/filter.cu
|
||||||
)
|
)
|
||||||
|
|
||||||
set(SRC_BVH_HEADERS
|
set(SRC_BVH_HEADERS
|
||||||
@@ -95,6 +113,8 @@ set(SRC_KERNELS_CPU_HEADERS
|
|||||||
kernels/cpu/kernel_cpu.h
|
kernels/cpu/kernel_cpu.h
|
||||||
kernels/cpu/kernel_cpu_impl.h
|
kernels/cpu/kernel_cpu_impl.h
|
||||||
kernels/cpu/kernel_cpu_image.h
|
kernels/cpu/kernel_cpu_image.h
|
||||||
|
kernels/cpu/filter_cpu.h
|
||||||
|
kernels/cpu/filter_cpu_impl.h
|
||||||
)
|
)
|
||||||
|
|
||||||
set(SRC_KERNELS_CUDA_HEADERS
|
set(SRC_KERNELS_CUDA_HEADERS
|
||||||
@@ -190,6 +210,21 @@ set(SRC_GEOM_HEADERS
|
|||||||
geom/geom_volume.h
|
geom/geom_volume.h
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Headers from the filter/ directory.
set(SRC_FILTER_HEADERS
	filter/filter.h
	filter/filter_defines.h
	filter/filter_features.h
	filter/filter_features_sse.h
	filter/filter_kernel.h
	filter/filter_nlm_cpu.h
	filter/filter_nlm_gpu.h
	filter/filter_prefilter.h
	filter/filter_reconstruction.h
	filter/filter_transform.h
	filter/filter_transform_gpu.h
	filter/filter_transform_sse.h
)
|
||||||
|
|
||||||
set(SRC_UTIL_HEADERS
|
set(SRC_UTIL_HEADERS
|
||||||
../util/util_atomic.h
|
../util/util_atomic.h
|
||||||
../util/util_color.h
|
../util/util_color.h
|
||||||
@@ -204,6 +239,7 @@ set(SRC_UTIL_HEADERS
|
|||||||
../util/util_math_int2.h
|
../util/util_math_int2.h
|
||||||
../util/util_math_int3.h
|
../util/util_math_int3.h
|
||||||
../util/util_math_int4.h
|
../util/util_math_int4.h
|
||||||
|
../util/util_math_matrix.h
|
||||||
../util/util_static_assert.h
|
../util/util_static_assert.h
|
||||||
../util/util_transform.h
|
../util/util_transform.h
|
||||||
../util/util_texture.h
|
../util/util_texture.h
|
||||||
@@ -295,23 +331,21 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
|||||||
${SRC_CLOSURE_HEADERS}
|
${SRC_CLOSURE_HEADERS}
|
||||||
${SRC_UTIL_HEADERS}
|
${SRC_UTIL_HEADERS}
|
||||||
)
|
)
|
||||||
|
# Files the CUDA filter cubins depend on: the kernel source itself plus
# every header group it may include.
set(cuda_filter_sources
	kernels/cuda/filter.cu
	${SRC_HEADERS}
	${SRC_KERNELS_CUDA_HEADERS}
	${SRC_FILTER_HEADERS}
	${SRC_UTIL_HEADERS}
)
|
||||||
set(cuda_cubins)
|
set(cuda_cubins)
|
||||||
|
|
||||||
macro(CYCLES_CUDA_KERNEL_ADD arch split experimental)
|
macro(CYCLES_CUDA_KERNEL_ADD arch name flags sources experimental)
|
||||||
if(${split})
|
|
||||||
set(cuda_extra_flags "-D__SPLIT__")
|
|
||||||
set(cuda_cubin kernel_split)
|
|
||||||
else()
|
|
||||||
set(cuda_extra_flags "")
|
|
||||||
set(cuda_cubin kernel)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(${experimental})
|
if(${experimental})
|
||||||
set(cuda_extra_flags ${cuda_extra_flags} -D__KERNEL_EXPERIMENTAL__)
|
set(flags ${flags} -D__KERNEL_EXPERIMENTAL__)
|
||||||
set(cuda_cubin ${cuda_cubin}_experimental)
|
set(name ${name}_experimental)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(cuda_cubin ${cuda_cubin}_${arch}.cubin)
|
set(cuda_cubin ${name}_${arch}.cubin)
|
||||||
|
|
||||||
if(WITH_CYCLES_DEBUG)
|
if(WITH_CYCLES_DEBUG)
|
||||||
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
|
set(cuda_debug_flags "-D__KERNEL_DEBUG__")
|
||||||
@@ -325,11 +359,7 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
|||||||
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
|
set(cuda_version_flags "-D__KERNEL_CUDA_VERSION__=${cuda_nvcc_version}")
|
||||||
set(cuda_math_flags "--use_fast_math")
|
set(cuda_math_flags "--use_fast_math")
|
||||||
|
|
||||||
if(split)
|
set(cuda_kernel_src "/kernels/cuda/${name}.cu")
|
||||||
set(cuda_kernel_src "/kernels/cuda/kernel_split.cu")
|
|
||||||
else()
|
|
||||||
set(cuda_kernel_src "/kernels/cuda/kernel.cu")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${cuda_cubin}
|
OUTPUT ${cuda_cubin}
|
||||||
@@ -343,13 +373,13 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
|||||||
${cuda_arch_flags}
|
${cuda_arch_flags}
|
||||||
${cuda_version_flags}
|
${cuda_version_flags}
|
||||||
${cuda_math_flags}
|
${cuda_math_flags}
|
||||||
${cuda_extra_flags}
|
${flags}
|
||||||
${cuda_debug_flags}
|
${cuda_debug_flags}
|
||||||
-I${CMAKE_CURRENT_SOURCE_DIR}/..
|
-I${CMAKE_CURRENT_SOURCE_DIR}/..
|
||||||
-DCCL_NAMESPACE_BEGIN=
|
-DCCL_NAMESPACE_BEGIN=
|
||||||
-DCCL_NAMESPACE_END=
|
-DCCL_NAMESPACE_END=
|
||||||
-DNVCC
|
-DNVCC
|
||||||
DEPENDS ${cuda_sources})
|
DEPENDS ${sources})
|
||||||
|
|
||||||
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
|
delayed_install("${CMAKE_CURRENT_BINARY_DIR}" "${cuda_cubin}" ${CYCLES_INSTALL_PATH}/lib)
|
||||||
list(APPEND cuda_cubins ${cuda_cubin})
|
list(APPEND cuda_cubins ${cuda_cubin})
|
||||||
@@ -363,11 +393,12 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
|||||||
|
|
||||||
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
|
foreach(arch ${CYCLES_CUDA_BINARIES_ARCH})
|
||||||
# Compile regular kernel
|
# Compile regular kernel
|
||||||
CYCLES_CUDA_KERNEL_ADD(${arch} FALSE FALSE)
|
CYCLES_CUDA_KERNEL_ADD(${arch} kernel "" "${cuda_sources}" FALSE)
|
||||||
|
CYCLES_CUDA_KERNEL_ADD(${arch} filter "" "${cuda_filter_sources}" FALSE)
|
||||||
|
|
||||||
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
|
if(WITH_CYCLES_CUDA_SPLIT_KERNEL_BINARIES)
|
||||||
# Compile split kernel
|
# Compile split kernel
|
||||||
CYCLES_CUDA_KERNEL_ADD(${arch} TRUE FALSE)
|
# Quote the source list so it is passed as the single "sources" macro argument (an unquoted list expands into several arguments).
CYCLES_CUDA_KERNEL_ADD(${arch} kernel_split "-D__SPLIT__" "${cuda_sources}" FALSE)
|
||||||
endif()
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
@@ -388,41 +419,30 @@ include_directories(SYSTEM ${INC_SYS})
|
|||||||
|
|
||||||
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_split.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
||||||
|
set_source_files_properties(kernels/cpu/filter.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_KERNEL_FLAGS}")
|
||||||
|
|
||||||
if(CXX_HAS_SSE)
|
if(CXX_HAS_SSE)
|
||||||
list(APPEND SRC
|
|
||||||
kernels/cpu/kernel_sse2.cpp
|
|
||||||
kernels/cpu/kernel_sse3.cpp
|
|
||||||
kernels/cpu/kernel_sse41.cpp
|
|
||||||
kernels/cpu/kernel_split_sse2.cpp
|
|
||||||
kernels/cpu/kernel_split_sse3.cpp
|
|
||||||
kernels/cpu/kernel_split_sse41.cpp
|
|
||||||
)
|
|
||||||
|
|
||||||
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_split_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_split_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_split_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||||
|
set_source_files_properties(kernels/cpu/filter_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
|
||||||
|
set_source_files_properties(kernels/cpu/filter_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
|
||||||
|
set_source_files_properties(kernels/cpu/filter_sse41.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE41_KERNEL_FLAGS}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(CXX_HAS_AVX)
|
if(CXX_HAS_AVX)
|
||||||
list(APPEND SRC
|
|
||||||
kernels/cpu/kernel_avx.cpp
|
|
||||||
kernels/cpu/kernel_split_avx.cpp
|
|
||||||
)
|
|
||||||
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_split_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||||
|
set_source_files_properties(kernels/cpu/filter_avx.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX_KERNEL_FLAGS}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(CXX_HAS_AVX2)
|
if(CXX_HAS_AVX2)
|
||||||
list(APPEND SRC
|
|
||||||
kernels/cpu/kernel_avx2.cpp
|
|
||||||
kernels/cpu/kernel_split_avx2.cpp
|
|
||||||
)
|
|
||||||
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||||
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
set_source_files_properties(kernels/cpu/kernel_split_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||||
|
set_source_files_properties(kernels/cpu/filter_avx2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_AVX2_KERNEL_FLAGS}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_library(cycles_kernel
|
add_library(cycles_kernel
|
||||||
@@ -432,6 +452,7 @@ add_library(cycles_kernel
|
|||||||
${SRC_KERNELS_CUDA_HEADERS}
|
${SRC_KERNELS_CUDA_HEADERS}
|
||||||
${SRC_BVH_HEADERS}
|
${SRC_BVH_HEADERS}
|
||||||
${SRC_CLOSURE_HEADERS}
|
${SRC_CLOSURE_HEADERS}
|
||||||
|
${SRC_FILTER_HEADERS}
|
||||||
${SRC_SVM_HEADERS}
|
${SRC_SVM_HEADERS}
|
||||||
${SRC_GEOM_HEADERS}
|
${SRC_GEOM_HEADERS}
|
||||||
${SRC_SPLIT_HEADERS}
|
${SRC_SPLIT_HEADERS}
|
||||||
@@ -472,12 +493,15 @@ delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_shadow_blocke
|
|||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_indirect_subsurface.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/kernel_buffer_update.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||||
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/opencl/filter.cl" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/opencl)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/kernel_split.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||||
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernels/cuda/filter.cu" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_KERNELS_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/kernels/cuda)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
|
||||||
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_FILTER_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/filter)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_SVM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/svm)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_GEOM_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/geom)
|
||||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
|
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_UTIL_HEADERS}" ${CYCLES_INSTALL_PATH}/source/util)
|
||||||
|
@@ -435,5 +435,23 @@ ccl_device bool bsdf_merge(ShaderClosure *a, ShaderClosure *b)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Classifies a closure as diffuse-like or specular-like.
|
||||||
|
* This is needed for the denoising feature pass generation,
|
||||||
|
* which are written on the first bounce where more than 25%
|
||||||
|
* of the sampling weight belongs to diffuse-like closures.
*
* Returns true (specular-like) for transparent closures and for
* microfacet closures whose alpha_x*alpha_y is at most 0.075*0.075;
* every other closure type returns false (diffuse-like). */
|
||||||
|
ccl_device_inline bool bsdf_is_specular_like(ShaderClosure *sc)
|
||||||
|
{
|
||||||
|
if(CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(CLOSURE_IS_BSDF_MICROFACET(sc->type)) {
|
||||||
|
MicrofacetBsdf *bsdf = (MicrofacetBsdf*) sc;
|
||||||
|
/* Compare the product of both alpha values against the squared threshold. */
return (bsdf->alpha_x*bsdf->alpha_y <= 0.075f*0.075f);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@@ -40,7 +40,6 @@ typedef ccl_addr_space struct VelvetBsdf {
|
|||||||
|
|
||||||
float sigma;
|
float sigma;
|
||||||
float invsigma2;
|
float invsigma2;
|
||||||
float3 N;
|
|
||||||
} VelvetBsdf;
|
} VelvetBsdf;
|
||||||
|
|
||||||
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
|
ccl_device int bsdf_ashikhmin_velvet_setup(VelvetBsdf *bsdf)
|
||||||
|
@@ -37,7 +37,6 @@ CCL_NAMESPACE_BEGIN
|
|||||||
|
|
||||||
typedef ccl_addr_space struct DiffuseBsdf {
|
typedef ccl_addr_space struct DiffuseBsdf {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
float3 N;
|
|
||||||
} DiffuseBsdf;
|
} DiffuseBsdf;
|
||||||
|
|
||||||
/* DIFFUSE */
|
/* DIFFUSE */
|
||||||
|
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
|
|||||||
typedef ccl_addr_space struct DiffuseRampBsdf {
|
typedef ccl_addr_space struct DiffuseRampBsdf {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
|
|
||||||
float3 N;
|
|
||||||
float3 *colors;
|
float3 *colors;
|
||||||
} DiffuseRampBsdf;
|
} DiffuseRampBsdf;
|
||||||
|
|
||||||
|
@@ -46,7 +46,6 @@ typedef ccl_addr_space struct MicrofacetBsdf {
|
|||||||
float alpha_x, alpha_y, ior;
|
float alpha_x, alpha_y, ior;
|
||||||
MicrofacetExtra *extra;
|
MicrofacetExtra *extra;
|
||||||
float3 T;
|
float3 T;
|
||||||
float3 N;
|
|
||||||
} MicrofacetBsdf;
|
} MicrofacetBsdf;
|
||||||
|
|
||||||
/* Beckmann and GGX microfacet importance sampling. */
|
/* Beckmann and GGX microfacet importance sampling. */
|
||||||
|
@@ -42,7 +42,7 @@ ccl_device_forceinline float D_ggx_aniso(const float3 wm, const float2 alpha)
|
|||||||
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
|
/* Sample slope distribution (based on page 14 of the supplemental implementation). */
|
||||||
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
|
ccl_device_forceinline float2 mf_sampleP22_11(const float cosI, const float2 randU)
|
||||||
{
|
{
|
||||||
if(cosI > 0.9999f || cosI < 1e-6f) {
|
if(cosI > 0.9999f || fabsf(cosI) < 1e-6f) {
|
||||||
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
|
const float r = sqrtf(randU.x / max(1.0f - randU.x, 1e-7f));
|
||||||
const float phi = M_2PI_F * randU.y;
|
const float phi = M_2PI_F * randU.y;
|
||||||
return make_float2(r*cosf(phi), r*sinf(phi));
|
return make_float2(r*cosf(phi), r*sinf(phi));
|
||||||
|
@@ -22,7 +22,6 @@ CCL_NAMESPACE_BEGIN
|
|||||||
typedef ccl_addr_space struct OrenNayarBsdf {
|
typedef ccl_addr_space struct OrenNayarBsdf {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
|
|
||||||
float3 N;
|
|
||||||
float roughness;
|
float roughness;
|
||||||
float a;
|
float a;
|
||||||
float b;
|
float b;
|
||||||
|
@@ -40,7 +40,6 @@ CCL_NAMESPACE_BEGIN
|
|||||||
typedef ccl_addr_space struct PhongRampBsdf {
|
typedef ccl_addr_space struct PhongRampBsdf {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
|
|
||||||
float3 N;
|
|
||||||
float exponent;
|
float exponent;
|
||||||
float3 *colors;
|
float3 *colors;
|
||||||
} PhongRampBsdf;
|
} PhongRampBsdf;
|
||||||
|
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct PrincipledDiffuseBsdf {
|
|||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
|
|
||||||
float roughness;
|
float roughness;
|
||||||
float3 N;
|
|
||||||
} PrincipledDiffuseBsdf;
|
} PrincipledDiffuseBsdf;
|
||||||
|
|
||||||
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
|
ccl_device float3 calculate_principled_diffuse_brdf(const PrincipledDiffuseBsdf *bsdf,
|
||||||
|
@@ -26,7 +26,6 @@ CCL_NAMESPACE_BEGIN
|
|||||||
|
|
||||||
typedef ccl_addr_space struct PrincipledSheenBsdf {
|
typedef ccl_addr_space struct PrincipledSheenBsdf {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
float3 N;
|
|
||||||
} PrincipledSheenBsdf;
|
} PrincipledSheenBsdf;
|
||||||
|
|
||||||
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
|
ccl_device float3 calculate_principled_sheen_brdf(const PrincipledSheenBsdf *bsdf,
|
||||||
|
@@ -38,7 +38,6 @@ CCL_NAMESPACE_BEGIN
|
|||||||
typedef ccl_addr_space struct ToonBsdf {
|
typedef ccl_addr_space struct ToonBsdf {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
|
|
||||||
float3 N;
|
|
||||||
float size;
|
float size;
|
||||||
float smooth;
|
float smooth;
|
||||||
} ToonBsdf;
|
} ToonBsdf;
|
||||||
|
@@ -28,7 +28,6 @@ typedef ccl_addr_space struct Bssrdf {
|
|||||||
float texture_blur;
|
float texture_blur;
|
||||||
float albedo;
|
float albedo;
|
||||||
float roughness;
|
float roughness;
|
||||||
float3 N;
|
|
||||||
} Bssrdf;
|
} Bssrdf;
|
||||||
|
|
||||||
/* Planar Truncated Gaussian
|
/* Planar Truncated Gaussian
|
||||||
|
52
intern/cycles/kernel/filter/filter.h
Normal file
52
intern/cycles/kernel/filter/filter.h
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __FILTER_H__
|
||||||
|
#define __FILTER_H__
|
||||||
|
|
||||||
|
/* CPU Filter Kernel Interface */
|
||||||
|
|
||||||
|
#include "util/util_types.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_defines.h"
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
#define KERNEL_NAME_JOIN(x, y, z) x ## _ ## y ## _ ## z
|
||||||
|
#define KERNEL_NAME_EVAL(arch, name) KERNEL_NAME_JOIN(kernel, arch, name)
|
||||||
|
#define KERNEL_FUNCTION_FULL_NAME(name) KERNEL_NAME_EVAL(KERNEL_ARCH, name)
|
||||||
|
|
||||||
|
#define KERNEL_ARCH cpu
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||||
|
|
||||||
|
#define KERNEL_ARCH cpu_sse2
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||||
|
|
||||||
|
#define KERNEL_ARCH cpu_sse3
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||||
|
|
||||||
|
#define KERNEL_ARCH cpu_sse41
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||||
|
|
||||||
|
#define KERNEL_ARCH cpu_avx
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||||
|
|
||||||
|
#define KERNEL_ARCH cpu_avx2
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu.h"
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
#endif /* __FILTER_H__ */
|
38
intern/cycles/kernel/filter/filter_defines.h
Normal file
38
intern/cycles/kernel/filter/filter_defines.h
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef __FILTER_DEFINES_H__
|
||||||
|
#define __FILTER_DEFINES_H__
|
||||||
|
|
||||||
|
#define DENOISE_FEATURES 10
|
||||||
|
#define TRANSFORM_SIZE (DENOISE_FEATURES*DENOISE_FEATURES)
|
||||||
|
#define XTWX_SIZE (((DENOISE_FEATURES+1)*(DENOISE_FEATURES+2))/2)
|
||||||
|
#define XTWY_SIZE (DENOISE_FEATURES+1)
|
||||||
|
|
||||||
|
/* Layout description for a group of tiles handed to the filter kernels:
* nine offsets, strides and buffer handles plus four x and four y coordinates.
* NOTE(review): presumably the nine entries are a 3x3 tile neighborhood and
* x/y hold the tile boundaries - confirm against the host-side code. */
typedef struct TilesInfo {
|
||||||
|
int offsets[9];
|
||||||
|
int strides[9];
|
||||||
|
int x[4];
|
||||||
|
int y[4];
|
||||||
|
/* TODO(lukas): CUDA doesn't have uint64_t... */
|
||||||
|
/* OpenCL stores real global-memory pointers; all other builds store the
* buffer handles as plain 64-bit integers. */
#ifdef __KERNEL_OPENCL__
|
||||||
|
ccl_global float *buffers[9];
|
||||||
|
#else
|
||||||
|
long long int buffers[9];
|
||||||
|
#endif
|
||||||
|
} TilesInfo;
|
||||||
|
|
||||||
|
#endif /* __FILTER_DEFINES_H__*/
|
120
intern/cycles/kernel/filter/filter_features.h
Normal file
120
intern/cycles/kernel/filter/filter_features.h
Normal file
@@ -0,0 +1,120 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride]
|
||||||
|
|
||||||
|
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y).
|
||||||
|
* pixel_buffer always points to the current pixel in the first pass. */
|
||||||
|
#define FOR_PIXEL_WINDOW pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
|
||||||
|
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
|
||||||
|
for(pixel.x = low.x; pixel.x < high.x; pixel.x++, pixel_buffer++) {
|
||||||
|
|
||||||
|
#define END_FOR_PIXEL_WINDOW } \
|
||||||
|
pixel_buffer += buffer_w - (high.x - low.x); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fill the feature vector of one pixel.
*
* features[0] and features[1] receive the pixel coordinates, features[2..9]
* receive passes 0..7 read from buffer (each pass is pass_stride floats apart,
* see ccl_get_feature). If mean is non-NULL, mean[i] is subtracted from every
* one of the DENOISE_FEATURES entries. */
ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *features, float ccl_restrict_ptr mean, int pass_stride)
|
||||||
|
{
|
||||||
|
features[0] = pixel.x;
|
||||||
|
features[1] = pixel.y;
|
||||||
|
features[2] = ccl_get_feature(buffer, 0);
|
||||||
|
features[3] = ccl_get_feature(buffer, 1);
|
||||||
|
features[4] = ccl_get_feature(buffer, 2);
|
||||||
|
features[5] = ccl_get_feature(buffer, 3);
|
||||||
|
features[6] = ccl_get_feature(buffer, 4);
|
||||||
|
features[7] = ccl_get_feature(buffer, 5);
|
||||||
|
features[8] = ccl_get_feature(buffer, 6);
|
||||||
|
features[9] = ccl_get_feature(buffer, 7);
|
||||||
|
if(mean) {
|
||||||
|
for(int i = 0; i < DENOISE_FEATURES; i++)
|
||||||
|
features[i] -= mean[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compute the deviation of one pixel's features from the given mean.
*
* Writes six values: scales[0], scales[1], scales[2] and scales[4] are the
* absolute deviations of x, y, pass 0 and pass 4; scales[3] and scales[5] are
* the squared lengths of the 3-component deviations of passes 1-3 and 5-7.
* mean is dereferenced unconditionally and must hold DENOISE_FEATURES values. */
ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float ccl_restrict_ptr buffer, float *scales, float ccl_restrict_ptr mean, int pass_stride)
|
||||||
|
{
|
||||||
|
scales[0] = fabsf(pixel.x - mean[0]);
|
||||||
|
scales[1] = fabsf(pixel.y - mean[1]);
|
||||||
|
scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]);
|
||||||
|
scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3],
|
||||||
|
ccl_get_feature(buffer, 2) - mean[4],
|
||||||
|
ccl_get_feature(buffer, 3) - mean[5]));
|
||||||
|
scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]);
|
||||||
|
scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7],
|
||||||
|
ccl_get_feature(buffer, 6) - mean[8],
|
||||||
|
ccl_get_feature(buffer, 7) - mean[9]));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Expand six scale values in place into ten per-feature reciprocal scales.
*
* On entry scale[0..5] hold the six values (presumably in the layout written
* by filter_get_feature_scales - confirm against callers); on exit scale[0..9]
* hold 1/max(value, 0.01), with the square root taken first for the entries
* read from scale[3] and scale[5].
* The assignment order matters: scale[4] and scale[5] are read before the
* last statement overwrites them. */
ccl_device_inline void filter_calculate_scale(float *scale)
|
||||||
|
{
|
||||||
|
scale[0] = 1.0f/max(scale[0], 0.01f);
|
||||||
|
scale[1] = 1.0f/max(scale[1], 0.01f);
|
||||||
|
scale[2] = 1.0f/max(scale[2], 0.01f);
|
||||||
|
scale[6] = 1.0f/max(scale[4], 0.01f);
|
||||||
|
scale[7] = scale[8] = scale[9] = 1.0f/max(sqrtf(scale[5]), 0.01f);
|
||||||
|
/* Must come last: this overwrites scale[4] and scale[5], which were inputs above. */
scale[3] = scale[4] = scale[5] = 1.0f/max(sqrtf(scale[3]), 0.01f);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return passes 0, 1 and 2 at buffer (pass_stride floats apart) as a float3. */
ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
|
||||||
|
{
|
||||||
|
return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return the average of passes 0, 1 and 2 at buffer (pass_stride floats apart).
* NOTE(review): presumably the caller passes a pointer that already addresses
* the variance passes - confirm against callers. */
ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_restrict_ptr buffer, int pass_stride)
|
||||||
|
{
|
||||||
|
return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Accumulate one feature into the design row.
*
* For every i in [0, rank), adds feature multiplied by the transform entry
* at index (row*DENOISE_FEATURES + i)*stride to design_row[1+i].
* design_row[0] is not touched. */
ccl_device_inline void design_row_add(float *design_row,
|
||||||
|
int rank,
|
||||||
|
ccl_global float ccl_restrict_ptr transform,
|
||||||
|
int stride,
|
||||||
|
int row,
|
||||||
|
float feature)
|
||||||
|
{
|
||||||
|
for(int i = 0; i < rank; i++) {
|
||||||
|
design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fill the design row.
*
* design_row[0] is set to 1 and design_row[1..rank] are cleared, then the
* ten feature differences between pixel q and pixel p (x, y and passes 0..7)
* are accumulated through design_row_add using transform rows 0..9. */
|
||||||
|
||||||
|
ccl_device_inline void filter_get_design_row_transform(int2 p_pixel,
|
||||||
|
ccl_global float ccl_restrict_ptr p_buffer,
|
||||||
|
int2 q_pixel,
|
||||||
|
ccl_global float ccl_restrict_ptr q_buffer,
|
||||||
|
int pass_stride,
|
||||||
|
int rank,
|
||||||
|
float *design_row,
|
||||||
|
ccl_global float ccl_restrict_ptr transform,
|
||||||
|
int stride)
|
||||||
|
{
|
||||||
|
design_row[0] = 1.0f;
|
||||||
|
math_vector_zero(design_row+1, rank);
|
||||||
|
design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x);
|
||||||
|
design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y);
|
||||||
|
design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6));
|
||||||
|
design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7));
|
||||||
|
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
95
intern/cycles/kernel/filter/filter_features_sse.h
Normal file
95
intern/cycles/kernel/filter/filter_features_sse.h
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
#define ccl_get_feature_sse(pass) _mm_loadu_ps(buffer + (pass)*pass_stride)
|
||||||
|
|
||||||
|
/* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y), 4 at a time.
|
||||||
|
* pixel_buffer always points to the first of the 4 current pixels in the first pass.
|
||||||
|
* x4 and y4 contain the coordinates of the four pixels, active_pixels contains a mask that's set for all pixels within the window. */
|
||||||
|
|
||||||
|
#define FOR_PIXEL_WINDOW_SSE pixel_buffer = buffer + (low.y - rect.y)*buffer_w + (low.x - rect.x); \
|
||||||
|
for(pixel.y = low.y; pixel.y < high.y; pixel.y++) { \
|
||||||
|
__m128 y4 = _mm_set1_ps(pixel.y); \
|
||||||
|
for(pixel.x = low.x; pixel.x < high.x; pixel.x += 4, pixel_buffer += 4) { \
|
||||||
|
__m128 x4 = _mm_add_ps(_mm_set1_ps(pixel.x), _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)); \
|
||||||
|
__m128 active_pixels = _mm_cmplt_ps(x4, _mm_set1_ps(high.x));
|
||||||
|
|
||||||
|
#define END_FOR_PIXEL_WINDOW_SSE } \
|
||||||
|
pixel_buffer += buffer_w - (pixel.x - low.x); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fill the feature vectors of four pixels at once.
*
* features[0] and features[1] receive the x and y vectors, features[2..9]
* receive four consecutive floats of passes 0..7 loaded (unaligned) from
* buffer. If mean is non-NULL it is subtracted per feature. Finally every
* feature is passed through _mm_mask_ps with active_pixels
* (presumably clearing the lanes outside the window - confirm). */
ccl_device_inline void filter_get_features_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *features, __m128 ccl_restrict_ptr mean, int pass_stride)
|
||||||
|
{
|
||||||
|
features[0] = x;
|
||||||
|
features[1] = y;
|
||||||
|
features[2] = ccl_get_feature_sse(0);
|
||||||
|
features[3] = ccl_get_feature_sse(1);
|
||||||
|
features[4] = ccl_get_feature_sse(2);
|
||||||
|
features[5] = ccl_get_feature_sse(3);
|
||||||
|
features[6] = ccl_get_feature_sse(4);
|
||||||
|
features[7] = ccl_get_feature_sse(5);
|
||||||
|
features[8] = ccl_get_feature_sse(6);
|
||||||
|
features[9] = ccl_get_feature_sse(7);
|
||||||
|
if(mean) {
|
||||||
|
for(int i = 0; i < DENOISE_FEATURES; i++)
|
||||||
|
features[i] = _mm_sub_ps(features[i], mean[i]);
|
||||||
|
}
|
||||||
|
for(int i = 0; i < DENOISE_FEATURES; i++)
|
||||||
|
features[i] = _mm_mask_ps(features[i], active_pixels);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Four-pixel version of the feature deviation computation.
*
* Writes six vectors: scales[0], scales[1], scales[2] and scales[4] are the
* absolute deviations of x, y, pass 0 and pass 4 from mean; scales[3] and
* scales[5] are the sums of squared deviations of passes 1-3 and 5-7.
* Every result is passed through _mm_mask_ps with active_pixels. mean is
* dereferenced unconditionally and must hold DENOISE_FEATURES vectors. */
ccl_device_inline void filter_get_feature_scales_sse(__m128 x, __m128 y, __m128 active_pixels, float ccl_restrict_ptr buffer, __m128 *scales, __m128 ccl_restrict_ptr mean, int pass_stride)
|
||||||
|
{
|
||||||
|
scales[0] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(x, mean[0])), active_pixels);
|
||||||
|
scales[1] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(y, mean[1])), active_pixels);
|
||||||
|
|
||||||
|
scales[2] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(0), mean[2])), active_pixels);
|
||||||
|
|
||||||
|
__m128 diff, scale;
|
||||||
|
diff = _mm_sub_ps(ccl_get_feature_sse(1), mean[3]);
|
||||||
|
scale = _mm_mul_ps(diff, diff);
|
||||||
|
diff = _mm_sub_ps(ccl_get_feature_sse(2), mean[4]);
|
||||||
|
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||||
|
diff = _mm_sub_ps(ccl_get_feature_sse(3), mean[5]);
|
||||||
|
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||||
|
scales[3] = _mm_mask_ps(scale, active_pixels);
|
||||||
|
|
||||||
|
scales[4] = _mm_mask_ps(_mm_fabs_ps(_mm_sub_ps(ccl_get_feature_sse(4), mean[6])), active_pixels);
|
||||||
|
|
||||||
|
diff = _mm_sub_ps(ccl_get_feature_sse(5), mean[7]);
|
||||||
|
scale = _mm_mul_ps(diff, diff);
|
||||||
|
diff = _mm_sub_ps(ccl_get_feature_sse(6), mean[8]);
|
||||||
|
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||||
|
diff = _mm_sub_ps(ccl_get_feature_sse(7), mean[9]);
|
||||||
|
scale = _mm_add_ps(scale, _mm_mul_ps(diff, diff));
|
||||||
|
scales[5] = _mm_mask_ps(scale, active_pixels);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Expand six scale vectors in place into ten per-feature reciprocal scales.
*
* Each input is reduced with _mm_hmax_ps (presumably a horizontal maximum
* over the four lanes - confirm), clamped to at least 0.01 and inverted with
* _mm_rcp_ps, which is an approximate reciprocal. The entries read from
* scale[3] and scale[5] have their square root taken first.
* The assignment order matters: scale[4] and scale[5] are read before the
* last statement overwrites them. */
ccl_device_inline void filter_calculate_scale_sse(__m128 *scale)
|
||||||
|
{
|
||||||
|
scale[0] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[0]), _mm_set1_ps(0.01f)));
|
||||||
|
scale[1] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[1]), _mm_set1_ps(0.01f)));
|
||||||
|
scale[2] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[2]), _mm_set1_ps(0.01f)));
|
||||||
|
scale[6] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(scale[4]), _mm_set1_ps(0.01f)));
|
||||||
|
|
||||||
|
scale[7] = scale[8] = scale[9] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[5])), _mm_set1_ps(0.01f)));
|
||||||
|
/* Must come last: this overwrites scale[4] and scale[5], which were inputs above. */
scale[3] = scale[4] = scale[5] = _mm_rcp_ps(_mm_max_ps(_mm_hmax_ps(_mm_sqrt_ps(scale[3])), _mm_set1_ps(0.01f)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
50
intern/cycles/kernel/filter/filter_kernel.h
Normal file
50
intern/cycles/kernel/filter/filter_kernel.h
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "util/util_color.h"
|
||||||
|
#include "util/util_math.h"
|
||||||
|
#include "util/util_math_fast.h"
|
||||||
|
#include "util/util_texture.h"
|
||||||
|
|
||||||
|
#include "util/util_atomic.h"
|
||||||
|
#include "util/util_math_matrix.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_defines.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_features.h"
|
||||||
|
#ifdef __KERNEL_SSE3__
|
||||||
|
# include "kernel/filter/filter_features_sse.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_prefilter.h"
|
||||||
|
|
||||||
|
#ifdef __KERNEL_GPU__
|
||||||
|
# include "kernel/filter/filter_transform_gpu.h"
|
||||||
|
#else
|
||||||
|
# ifdef __KERNEL_SSE3__
|
||||||
|
# include "kernel/filter/filter_transform_sse.h"
|
||||||
|
# else
|
||||||
|
# include "kernel/filter/filter_transform.h"
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_reconstruction.h"
|
||||||
|
|
||||||
|
#ifdef __KERNEL_CPU__
|
||||||
|
# include "kernel/filter/filter_nlm_cpu.h"
|
||||||
|
#else
|
||||||
|
# include "kernel/filter/filter_nlm_gpu.h"
|
||||||
|
#endif
|
163
intern/cycles/kernel/filter/filter_nlm_cpu.h
Normal file
163
intern/cycles/kernel/filter/filter_nlm_cpu.h
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* Fill differenceImage with the per-pixel distance between the image and a copy of
 * itself shifted by (dx, dy).
 *
 * - weightImage, varianceImage: Input planes; when channel_offset is non-zero three
 *   planes spaced channel_offset floats apart are read, otherwise a single plane.
 * - differenceImage: Output, one float per pixel.
 * - rect: Processed area (x/y lower bounds inclusive, z/w upper bounds exclusive).
 * - w: Row stride of all images in floats.
 * - a, k_2: Constants of the distance formula (variance compensation in the numerator,
 *   variance scaling in the denominator).
 *
 * The shifted pixel (x+dx, y+dy) is read without bounds checks. */
ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                         float ccl_restrict_ptr weightImage,
                                                         float ccl_restrict_ptr varianceImage,
                                                         float *differenceImage,
                                                         int4 rect, int w,
                                                         int channel_offset,
                                                         float a, float k_2)
{
	const int num_channels = channel_offset? 3 : 1;

	for(int y = rect.y; y < rect.w; y++) {
		for(int x = rect.x; x < rect.z; x++) {
			const int p_idx = y*w + x;
			const int q_idx = (y+dy)*w + (x+dx);

			float distance = 0.0f;
			for(int c = 0; c < num_channels; c++) {
				const int plane = c*channel_offset;
				const float delta = weightImage[plane + p_idx] - weightImage[plane + q_idx];
				const float p_var = varianceImage[plane + p_idx];
				const float q_var = varianceImage[plane + q_idx];
				/* The small epsilon keeps the division defined when both variances are zero. */
				distance += (delta*delta - a*(p_var + min(p_var, q_var))) / (1e-8f + k_2*(p_var + q_var));
			}
			/* Average over the channels in the multi-channel case. */
			if(num_channels > 1) {
				distance *= 1.0f/num_channels;
			}
			differenceImage[p_idx] = distance;
		}
	}
}
|
||||||
|
|
||||||
|
/* Vertical box blur: every output pixel becomes the mean of differenceImage over the
 * rows [y-f, y+f], clipped to rect, in the same column.
 *
 * - rect: Processed area (x/y lower bounds inclusive, z/w upper bounds exclusive).
 * - w: Row stride of both images in floats.
 * - f: Blur radius in pixels.
 *
 * The SSE path uses aligned loads/stores on a column range rounded outwards to
 * multiples of four, so it assumes both buffers are 16-byte aligned, w is a multiple
 * of four and the rounded-out columns are addressable - TODO confirm against the
 * caller. Columns outside [rect.x, rect.z) touched by that path are neither cleared
 * nor normalized here. */
ccl_device_inline void kernel_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
#ifdef __KERNEL_SSE3__
	const int simd_begin = rect.x & ~3;
	const int simd_end = (rect.z + 3) & ~3;
#endif
	for(int y = rect.y; y < rect.w; y++) {
		const int out_row = y*w;
		const int first_row = max(rect.y, y-f);
		const int end_row = min(rect.w, y+f+1);

		/* Clear the accumulator row. */
		for(int x = rect.x; x < rect.z; x++) {
			outImage[out_row + x] = 0.0f;
		}

		/* Add up all source rows of the clipped window. */
		for(int src_y = first_row; src_y < end_row; src_y++) {
			const int src_row = src_y*w;
#ifdef __KERNEL_SSE3__
			for(int x = simd_begin; x < simd_end; x += 4) {
				const __m128 acc = _mm_load_ps(outImage + out_row + x);
				const __m128 src = _mm_load_ps(differenceImage + src_row + x);
				_mm_store_ps(outImage + out_row + x, _mm_add_ps(acc, src));
			}
#else
			for(int x = rect.x; x < rect.z; x++) {
				outImage[out_row + x] += differenceImage[src_row + x];
			}
#endif
		}

		/* Turn the sum into a mean over the rows that were actually visited. */
		const float inv_rows = 1.0f/(end_row - first_row);
		for(int x = rect.x; x < rect.z; x++) {
			outImage[out_row + x] *= inv_rows;
		}
	}
}
|
||||||
|
|
||||||
|
/* Horizontal box blur followed by the weight mapping: every output pixel becomes
 * exp(-max(mean, 0)), where mean is the average of differenceImage over the columns
 * [x-f, x+f], clipped to rect, in the same row.
 *
 * - rect: Processed area (x/y lower bounds inclusive, z/w upper bounds exclusive).
 * - w: Row stride of both images in floats.
 * - f: Blur radius in pixels. */
ccl_device_inline void kernel_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
	/* Start from a cleared accumulator. */
	for(int y = rect.y; y < rect.w; y++) {
		const int row = y*w;
		for(int x = rect.x; x < rect.z; x++) {
			outImage[row + x] = 0.0f;
		}
	}

	/* Accumulate one horizontal shift at a time; the column range is trimmed so that
	 * the shifted read x+dx never leaves [rect.x, rect.z). */
	for(int dx = -f; dx <= f; dx++) {
		const int x_begin = rect.x - min(0, dx);
		const int x_end = rect.z - max(0, dx);
		for(int y = rect.y; y < rect.w; y++) {
			const int row = y*w;
			for(int x = x_begin; x < x_end; x++) {
				outImage[row + x] += differenceImage[row + dx + x];
			}
		}
	}

	/* Normalize by the number of shifts that contributed to each pixel and map the
	 * clamped mean to a weight. */
	for(int y = rect.y; y < rect.w; y++) {
		const int row = y*w;
		for(int x = rect.x; x < rect.z; x++) {
			const int first = max(rect.x, x-f);
			const int end = min(rect.z, x+f+1);
			const float mean_diff = outImage[row + x] * (1.0f/(end - first));
			outImage[row + x] = expf(-max(mean_diff, 0.0f));
		}
	}
}
|
||||||
|
|
||||||
|
/* Accumulate the contribution of the shift (dx, dy) into the filtered image.
 *
 * For every pixel the weight is the mean of differenceImage over the columns
 * [x-f, x+f], clipped to rect, in the same row. The weight is added to accumImage and
 * the weighted value of image at (x+dx, y+dy) is added to outImage.
 *
 * - rect: Processed area (x/y lower bounds inclusive, z/w upper bounds exclusive).
 * - w: Row stride of all images in floats.
 *
 * The shifted pixel is read without bounds checks. */
ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy, float ccl_restrict_ptr differenceImage, float ccl_restrict_ptr image, float *outImage, float *accumImage, int4 rect, int w, int f)
{
	for(int y = rect.y; y < rect.w; y++) {
		const int row = y*w;
		const int shifted_row = (y+dy)*w;
		for(int x = rect.x; x < rect.z; x++) {
			const int first = max(rect.x, x-f);
			const int end = min(rect.z, x+f+1);

			float total = 0.0f;
			for(int x1 = first; x1 < end; x1++) {
				total += differenceImage[row + x1];
			}
			const float weight = total * (1.0f/(end - first));

			const int idx = row + x;
			accumImage[idx] += weight;
			outImage[idx] += weight*image[shifted_row + (x+dx)];
		}
	}
}
|
||||||
|
|
||||||
|
/* Feed the contribution of the shift (dx, dy) into the per-pixel least-squares storage.
 *
 * For every pixel that lies both in the filter window and in rect, the weight is the
 * mean of differenceImage over the columns [x-f, x+f], clipped to rect, and
 * kernel_filter_construct_gramian is called with that pixel's slice of the storage.
 *
 * - rect: Area in feature-window coordinates (lower bounds inclusive, upper exclusive).
 * - filter_rect: x/y are the origin of the filter window, z/w its width and height;
 *   z is also the row stride of the per-pixel storage.
 * - transform, rank, XtWX, XtWY: Per-pixel storage with TRANSFORM_SIZE, 1, XTWX_SIZE
 *   and XTWY_SIZE elements per pixel respectively. */
ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                           float ccl_restrict_ptr differenceImage,
                                                           float ccl_restrict_ptr buffer,
                                                           float *color_pass,
                                                           float *variance_pass,
                                                           float *transform,
                                                           int *rank,
                                                           float *XtWX,
                                                           float3 *XtWY,
                                                           int4 rect,
                                                           int4 filter_rect,
                                                           int w, int h, int f,
                                                           int pass_stride)
{
	/* fx and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
	const int fy_begin = max(0, rect.y-filter_rect.y);
	const int fy_end = min(filter_rect.w, rect.w-filter_rect.y);
	const int fx_begin = max(0, rect.x-filter_rect.x);
	const int fx_end = min(filter_rect.z, rect.z-filter_rect.x);

	for(int fy = fy_begin; fy < fy_end; fy++) {
		const int y = fy + filter_rect.y;
		for(int fx = fx_begin; fx < fx_end; fx++) {
			const int x = fx + filter_rect.x;

			/* Horizontal box blur of the difference image gives the weight. */
			const int first = max(rect.x, x-f);
			const int end = min(rect.z, x+f+1);
			float total = 0.0f;
			for(int x1 = first; x1 < end; x1++) {
				total += differenceImage[y*w + x1];
			}
			const float weight = total * (1.0f/(end - first));

			/* Locate this pixel's slice of the storage arrays. */
			const int storage_ofs = fy*filter_rect.z + fx;

			kernel_filter_construct_gramian(x, y, 1,
			                                dx, dy, w, h,
			                                pass_stride,
			                                buffer,
			                                color_pass, variance_pass,
			                                transform + storage_ofs*TRANSFORM_SIZE,
			                                rank + storage_ofs,
			                                weight,
			                                XtWX + storage_ofs*XTWX_SIZE,
			                                XtWY + storage_ofs*XTWY_SIZE,
			                                0);
		}
	}
}
|
||||||
|
|
||||||
|
/* Divide every pixel of outImage inside rect by the matching entry of accumImage.
 *
 * - rect: Processed area (x/y lower bounds inclusive, z/w upper bounds exclusive).
 * - w: Row stride of both images in floats.
 *
 * No check is made for a zero accumulator. */
ccl_device_inline void kernel_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
{
	for(int y = rect.y; y < rect.w; y++) {
		const int row = y*w;
		for(int x = rect.x; x < rect.z; x++) {
			const int idx = row + x;
			outImage[idx] /= accumImage[idx];
		}
	}
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
147
intern/cycles/kernel/filter/filter_nlm_gpu.h
Normal file
147
intern/cycles/kernel/filter/filter_nlm_gpu.h
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* Per-pixel variant: write to differenceImage the distance between pixel (x, y) and
 * the pixel shifted by (dx, dy).
 *
 * - weightImage, varianceImage: Input planes; when channel_offset is non-zero three
 *   planes spaced channel_offset floats apart are read, otherwise a single plane.
 * - w: Row stride of all images in floats.
 * - a, k_2: Constants of the distance formula (variance compensation in the numerator,
 *   variance scaling in the denominator).
 *
 * rect is not used here and neither pixel is bounds-checked; presumably the caller
 * only invokes this for valid coordinates - TODO confirm. */
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
                                                         int dx, int dy,
                                                         ccl_global float ccl_restrict_ptr weightImage,
                                                         ccl_global float ccl_restrict_ptr varianceImage,
                                                         ccl_global float *differenceImage,
                                                         int4 rect, int w,
                                                         int channel_offset,
                                                         float a, float k_2)
{
	const int num_channels = channel_offset? 3 : 1;
	const int p_idx = y*w + x;
	const int q_idx = (y+dy)*w + (x+dx);

	float distance = 0.0f;
	for(int c = 0; c < num_channels; c++) {
		const int plane = c*channel_offset;
		const float delta = weightImage[plane + p_idx] - weightImage[plane + q_idx];
		const float p_var = varianceImage[plane + p_idx];
		const float q_var = varianceImage[plane + q_idx];
		/* The small epsilon keeps the division defined when both variances are zero. */
		distance += (delta*delta - a*(p_var + min(p_var, q_var))) / (1e-8f + k_2*(p_var + q_var));
	}
	/* Average over the channels in the multi-channel case. */
	if(num_channels > 1) {
		distance *= 1.0f/num_channels;
	}
	differenceImage[p_idx] = distance;
}
|
||||||
|
|
||||||
|
/* Per-pixel vertical box blur: outImage at (x, y) becomes the mean of differenceImage
 * over the rows [y-f, y+f], clipped to [rect.y, rect.w), in column x.
 *
 * - w: Row stride of both images in floats.
 * - f: Blur radius in pixels. */
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
                                              ccl_global float ccl_restrict_ptr differenceImage,
                                              ccl_global float *outImage,
                                              int4 rect, int w, int f)
{
	const int first_row = max(rect.y, y-f);
	const int end_row = min(rect.w, y+f+1);

	float total = 0.0f;
	for(int src_y = first_row; src_y < end_row; src_y++) {
		total += differenceImage[src_y*w + x];
	}

	outImage[y*w + x] = total * (1.0f/(end_row - first_row));
}
|
||||||
|
|
||||||
|
/* Per-pixel horizontal box blur followed by the weight mapping: outImage at (x, y)
 * becomes exp(-max(mean, 0)), where mean is the average of differenceImage over the
 * columns [x-f, x+f], clipped to [rect.x, rect.z), in row y.
 *
 * - w: Row stride of both images in floats.
 * - f: Blur radius in pixels. */
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
                                                     ccl_global float ccl_restrict_ptr differenceImage,
                                                     ccl_global float *outImage,
                                                     int4 rect, int w, int f)
{
	const int row = y*w;
	const int first = max(rect.x, x-f);
	const int end = min(rect.z, x+f+1);

	float total = 0.0f;
	for(int x1 = first; x1 < end; x1++) {
		total += differenceImage[row + x1];
	}

	const float mean_diff = total * (1.0f/(end - first));
	outImage[row + x] = expf(-max(mean_diff, 0.0f));
}
|
||||||
|
|
||||||
|
/* Per-pixel variant: accumulate the contribution of the shift (dx, dy) at pixel (x, y).
 *
 * The weight is the mean of differenceImage over the columns [x-f, x+f], clipped to
 * [rect.x, rect.z), in row y.
 * - With a non-NULL outImage the weight is added to accumImage and the weighted value
 *   of image at (x+dx, y+dy) is added to outImage.
 * - With a NULL outImage the weight overwrites accumImage and image is not read. */
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
                                                       int dx, int dy,
                                                       ccl_global float ccl_restrict_ptr differenceImage,
                                                       ccl_global float ccl_restrict_ptr image,
                                                       ccl_global float *outImage,
                                                       ccl_global float *accumImage,
                                                       int4 rect, int w, int f)
{
	const int row = y*w;
	const int first = max(rect.x, x-f);
	const int end = min(rect.z, x+f+1);

	float total = 0.0f;
	for(int x1 = first; x1 < end; x1++) {
		total += differenceImage[row + x1];
	}
	const float weight = total * (1.0f/(end - first));

	const int idx = row + x;
	if(!outImage) {
		/* Weight-only mode: store instead of accumulate. */
		accumImage[idx] = weight;
		return;
	}

	accumImage[idx] += weight;
	outImage[idx] += weight*image[(y+dy)*w + (x+dx)];
}
|
||||||
|
|
||||||
|
/* Per-pixel variant: feed the contribution of the shift (dx, dy) into the least-squares
 * storage of the filter-window pixel (fx, fy).
 *
 * - fx, fy: Pixel in filter-window-relative coordinates; filter_rect.x/y are added to
 *   obtain the feature-window coordinates used for all image accesses.
 * - filter_rect: z is used as the row stride of the storage, and z*w is passed on as
 *   the storage stride between elements of one pixel.
 * - transform, rank, XtWX, XtWY: Storage arrays, each advanced by fy*filter_rect.z + fx
 *   to reach the first element of this pixel.
 * - localIdx: Forwarded unchanged to kernel_filter_construct_gramian.
 *
 * The weight is the mean of differenceImage over the columns [x-f, x+f], clipped to
 * [rect.x, rect.z), in row y. */
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
                                                           int dx, int dy,
                                                           ccl_global float ccl_restrict_ptr differenceImage,
                                                           ccl_global float ccl_restrict_ptr buffer,
                                                           ccl_global float *color_pass,
                                                           ccl_global float *variance_pass,
                                                           ccl_global float ccl_restrict_ptr transform,
                                                           ccl_global int *rank,
                                                           ccl_global float *XtWX,
                                                           ccl_global float3 *XtWY,
                                                           int4 rect,
                                                           int4 filter_rect,
                                                           int w, int h, int f,
                                                           int pass_stride,
                                                           int localIdx)
{
	const int x = fx + filter_rect.x;
	const int y = fy + filter_rect.y;

	/* Horizontal box blur of the difference image gives the weight. */
	const int first = max(rect.x, x-f);
	const int end = min(rect.z, x+f+1);
	float total = 0.0f;
	for(int x1 = first; x1 < end; x1++) {
		total += differenceImage[y*w + x1];
	}
	const float weight = total * (1.0f/(end - first));

	/* Offset of this pixel's first element in each storage array. */
	const int storage_ofs = fy*filter_rect.z + fx;

	kernel_filter_construct_gramian(x, y,
	                                filter_rect.z*filter_rect.w,
	                                dx, dy, w, h,
	                                pass_stride,
	                                buffer,
	                                color_pass, variance_pass,
	                                transform + storage_ofs,
	                                rank + storage_ofs,
	                                weight,
	                                XtWX + storage_ofs,
	                                XtWY + storage_ofs,
	                                localIdx);
}
|
||||||
|
|
||||||
|
/* Per-pixel variant: divide outImage at (x, y) by the matching entry of accumImage.
 * w is the row stride in floats; rect is not used. No check is made for a zero
 * accumulator. */
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
                                                   ccl_global float *outImage,
                                                   ccl_global float ccl_restrict_ptr accumImage,
                                                   int4 rect, int w)
{
	const int idx = y*w + x;
	outImage[idx] /= accumImage[idx];
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
145
intern/cycles/kernel/filter/filter_prefilter.h
Normal file
145
intern/cycles/kernel/filter/filter_prefilter.h
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* First step of the shadow prefiltering, performs the shadow division and stores all data
|
||||||
|
* in a nice and easy rectangular array that can be passed to the NLM filter.
|
||||||
|
*
|
||||||
|
* Calculates:
|
||||||
|
* unfiltered: Contains the two half images of the shadow feature pass
|
||||||
|
* sampleVariance: The sample-based variance calculated in the kernel. Note: This calculation is biased in general, and especially here since the variance of the ratio can only be approximated.
|
||||||
|
* sampleVarianceV: Variance of the sample variance estimation, quite noisy (since it's essentially the buffer variance of the two variance halves)
|
||||||
|
* bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
|
||||||
|
*/
|
||||||
|
ccl_device void kernel_filter_divide_shadow(int sample,
|
||||||
|
ccl_global TilesInfo *tiles,
|
||||||
|
int x, int y,
|
||||||
|
ccl_global float *unfilteredA,
|
||||||
|
ccl_global float *unfilteredB,
|
||||||
|
ccl_global float *sampleVariance,
|
||||||
|
ccl_global float *sampleVarianceV,
|
||||||
|
ccl_global float *bufferVariance,
|
||||||
|
int4 rect,
|
||||||
|
int buffer_pass_stride,
|
||||||
|
int buffer_denoising_offset,
|
||||||
|
bool use_split_variance)
|
||||||
|
{
|
||||||
|
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
|
||||||
|
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
|
||||||
|
int tile = ytile*3+xtile;
|
||||||
|
|
||||||
|
int offset = tiles->offsets[tile];
|
||||||
|
int stride = tiles->strides[tile];
|
||||||
|
ccl_global float ccl_restrict_ptr center_buffer = (ccl_global float*) tiles->buffers[tile];
|
||||||
|
center_buffer += (y*stride + x + offset)*buffer_pass_stride;
|
||||||
|
center_buffer += buffer_denoising_offset + 14;
|
||||||
|
|
||||||
|
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||||
|
int idx = (y-rect.y)*buffer_w + (x - rect.x);
|
||||||
|
unfilteredA[idx] = center_buffer[1] / max(center_buffer[0], 1e-7f);
|
||||||
|
unfilteredB[idx] = center_buffer[4] / max(center_buffer[3], 1e-7f);
|
||||||
|
|
||||||
|
float varA = center_buffer[2];
|
||||||
|
float varB = center_buffer[5];
|
||||||
|
int odd_sample = (sample+1)/2;
|
||||||
|
int even_sample = sample/2;
|
||||||
|
if(use_split_variance) {
|
||||||
|
varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
|
||||||
|
varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
|
||||||
|
}
|
||||||
|
varA /= (odd_sample - 1);
|
||||||
|
varB /= (even_sample - 1);
|
||||||
|
|
||||||
|
sampleVariance[idx] = 0.5f*(varA + varB) / sample;
|
||||||
|
sampleVarianceV[idx] = 0.5f * (varA - varB) * (varA - varB) / (sample*sample);
|
||||||
|
bufferVariance[idx] = 0.5f * (unfilteredA[idx] - unfilteredB[idx]) * (unfilteredA[idx] - unfilteredB[idx]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Load a regular feature from the render buffers into the denoise buffer.
|
||||||
|
* Parameters:
|
||||||
|
* - sample: The sample amount in the buffer, used to normalize the buffer.
|
||||||
|
* - m_offset, v_offset: Render Buffer Pass offsets of mean and variance of the feature.
|
||||||
|
* - x, y: Current pixel
|
||||||
|
* - mean, variance: Target denoise buffers.
|
||||||
|
* - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
|
||||||
|
*/
|
||||||
|
ccl_device void kernel_filter_get_feature(int sample,
|
||||||
|
ccl_global TilesInfo *tiles,
|
||||||
|
int m_offset, int v_offset,
|
||||||
|
int x, int y,
|
||||||
|
ccl_global float *mean,
|
||||||
|
ccl_global float *variance,
|
||||||
|
int4 rect, int buffer_pass_stride,
|
||||||
|
int buffer_denoising_offset,
|
||||||
|
bool use_split_variance)
|
||||||
|
{
|
||||||
|
int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
|
||||||
|
int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
|
||||||
|
int tile = ytile*3+xtile;
|
||||||
|
ccl_global float *center_buffer = ((ccl_global float*) tiles->buffers[tile]) + (tiles->offsets[tile] + y*tiles->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
|
||||||
|
|
||||||
|
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||||
|
int idx = (y-rect.y)*buffer_w + (x - rect.x);
|
||||||
|
|
||||||
|
mean[idx] = center_buffer[m_offset] / sample;
|
||||||
|
if(use_split_variance) {
|
||||||
|
variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Combine A/B buffers.
|
||||||
|
* Calculates the combined mean and the buffer variance. */
|
||||||
|
ccl_device void kernel_filter_combine_halves(int x, int y,
|
||||||
|
ccl_global float *mean,
|
||||||
|
ccl_global float *variance,
|
||||||
|
ccl_global float *a,
|
||||||
|
ccl_global float *b,
|
||||||
|
int4 rect, int r)
|
||||||
|
{
|
||||||
|
int buffer_w = align_up(rect.z - rect.x, 4);
|
||||||
|
int idx = (y-rect.y)*buffer_w + (x - rect.x);
|
||||||
|
|
||||||
|
if(mean) mean[idx] = 0.5f * (a[idx]+b[idx]);
|
||||||
|
if(variance) {
|
||||||
|
if(r == 0) variance[idx] = 0.25f * (a[idx]-b[idx])*(a[idx]-b[idx]);
|
||||||
|
else {
|
||||||
|
variance[idx] = 0.0f;
|
||||||
|
float values[25];
|
||||||
|
int numValues = 0;
|
||||||
|
for(int py = max(y-r, rect.y); py < min(y+r+1, rect.w); py++) {
|
||||||
|
for(int px = max(x-r, rect.x); px < min(x+r+1, rect.z); px++) {
|
||||||
|
int pidx = (py-rect.y)*buffer_w + (px-rect.x);
|
||||||
|
values[numValues++] = 0.25f * (a[pidx]-b[pidx])*(a[pidx]-b[pidx]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Insertion-sort the variances (fast enough for 25 elements). */
|
||||||
|
for(int i = 1; i < numValues; i++) {
|
||||||
|
float v = values[i];
|
||||||
|
int j;
|
||||||
|
for(j = i-1; j >= 0 && values[j] > v; j--)
|
||||||
|
values[j+1] = values[j];
|
||||||
|
values[j+1] = v;
|
||||||
|
}
|
||||||
|
variance[idx] = values[(7*numValues)/8];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
103
intern/cycles/kernel/filter/filter_reconstruction.h
Normal file
103
intern/cycles/kernel/filter/filter_reconstruction.h
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* Add the pixel pair (x, y) -> (x+dx, y+dy) to the weighted least-squares system of (x, y).
 *
 * - storage_stride: Distance between consecutive elements of transform/XtWX/XtWY; ignored on
 *   the CPU, where a stride of 1 is used.
 * - w: Row stride of the pass buffers; h is not used here.
 * - pass_stride: Forwarded to the helpers that read colors, variances and features.
 * - buffer, color_pass, variance_pass: Feature, color and variance data of the window.
 * - transform, rank: Feature transform of the pixel and its rank; rank+1 terms are used.
 * - weight: Weight of this pixel pair.
 * - XtWX, XtWY: Accumulators that are updated.
 * - localIdx: Selects the slice of the shared design row on CUDA; unused on the CPU.
 *
 * Pairs whose average absolute color difference exceeds three times the sum of both standard
 * deviations (plus 1e-3) are skipped entirely. */
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                       int storage_stride,
                                                       int dx, int dy,
                                                       int w, int h,
                                                       int pass_stride,
                                                       ccl_global float ccl_restrict_ptr buffer,
                                                       ccl_global float *color_pass,
                                                       ccl_global float *variance_pass,
                                                       ccl_global float ccl_restrict_ptr transform,
                                                       ccl_global int *rank,
                                                       float weight,
                                                       ccl_global float *XtWX,
                                                       ccl_global float3 *XtWY,
                                                       int localIdx)
{
	const int center_ofs = y*w + x;
	const int neighbor_ofs = (y+dy)*w + (x+dx);

	/* Storage stride and design-row scratch space depend on the device. */
#ifdef __KERNEL_CPU__
	const int stride = 1;
	(void)storage_stride;
	(void)localIdx;
	float design_row[DENOISE_FEATURES+1];
#elif defined(__KERNEL_CUDA__)
	const int stride = storage_stride;
	ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
	ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
#else
	const int stride = storage_stride;
	float design_row[DENOISE_FEATURES+1];
#endif

	float3 center_color = filter_get_pixel_color(color_pass + center_ofs, pass_stride);
	float3 neighbor_color = filter_get_pixel_color(color_pass + neighbor_ofs, pass_stride);

	const float center_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + center_ofs, pass_stride));
	const float neighbor_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + neighbor_ofs, pass_stride));

	/* Reject neighbors whose color is too far away to be explained by noise. */
	const float color_distance = average(fabs(center_color - neighbor_color));
	const float tolerance = 3.0f*(center_std_dev + neighbor_std_dev + 1e-3f);
	if(color_distance > tolerance) {
		return;
	}

	filter_get_design_row_transform(make_int2(x, y), buffer + center_ofs,
	                                make_int2(x+dx, y+dy), buffer + neighbor_ofs,
	                                pass_stride, *rank, design_row, transform, stride);

	math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
	math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * neighbor_color, stride);
}
|
||||||
|
|
||||||
|
/* Solve the accumulated least-squares system of pixel (x, y) and write the resulting color
 * into the render buffer.
 *
 * - rank: Rank of the pixel's feature transform; the system has rank+1 terms.
 * - storage_stride: Distance between consecutive elements of XtWX/XtWY; ignored on the CPU,
 *   where a stride of 1 is used.
 * - XtWX, XtWY: The system; XtWY is overwritten by the solver and its first entry is taken
 *   as the color.
 * - buffer_params: x is the pixel offset, y the row stride, z the number of floats per pixel
 *   and w an optional per-pixel float offset; when w is non-zero the three floats stored
 *   there are added to the result.
 * - sample: The color is multiplied by this value before it is stored.
 *
 * w and h are not used here. */
ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
                                              ccl_global float *buffer,
                                              ccl_global int *rank,
                                              int storage_stride,
                                              ccl_global float *XtWX,
                                              ccl_global float3 *XtWY,
                                              int4 buffer_params,
                                              int sample)
{
#ifdef __KERNEL_CPU__
	const int stride = 1;
	(void)storage_stride;
#else
	const int stride = storage_stride;
#endif

	math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);

	float3 result = XtWY[0];

	/* Address of this pixel in the render buffer. */
	const int pixel_index = y*buffer_params.y + x + buffer_params.x;
	ccl_global float *pixel_out = buffer + pixel_index*buffer_params.z;

	result *= sample;

	const int extra_ofs = buffer_params.w;
	if(extra_ofs) {
		result.x += pixel_out[extra_ofs+0];
		result.y += pixel_out[extra_ofs+1];
		result.z += pixel_out[extra_ofs+2];
	}

	pixel_out[0] = result.x;
	pixel_out[1] = result.y;
	pixel_out[2] = result.z;
}
|
||||||
|
|
||||||
|
#undef STORAGE_TYPE
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
113
intern/cycles/kernel/filter/filter_transform.h
Normal file
113
intern/cycles/kernel/filter/filter_transform.h
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* Build the feature-space transform for the pixel (x, y) and determine its rank.
 *
 * - buffer: Prefiltered feature data of the window; rows are rect.z - rect.x floats wide,
 *   rounded up to a multiple of 4.
 * - rect: Area covered by the buffer (lower bounds inclusive, upper bounds exclusive).
 * - pass_stride: Forwarded to the feature helpers.
 * - transform: Output matrix of DENOISE_FEATURES x DENOISE_FEATURES floats.
 * - rank: Output, number of transform rows that are kept (at least 2 when
 *   DENOISE_FEATURES allows it).
 * - radius: Half size of the window around (x, y), clipped to rect.
 * - pca_threshold: If negative, rows are kept until the accumulated diagonal energy reaches
 *   (1 + pca_threshold) of the total; otherwise rows are kept until the square root of a
 *   diagonal entry drops below the threshold.
 *
 * The FOR_PIXEL_WINDOW/END_FOR_PIXEL_WINDOW macros are defined elsewhere and operate on the
 * local names declared below, so those names must not be changed. */
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
                                                  int x, int y, int4 rect,
                                                  int pass_stride,
                                                  float *transform, int *rank,
                                                  int radius, float pca_threshold)
{
	int buffer_w = align_up(rect.z - rect.x, 4);

	float features[DENOISE_FEATURES];

	/* Temporary storage, used in different steps of the algorithm. */
	float tempmatrix[DENOISE_FEATURES*DENOISE_FEATURES];
	float tempvector[2*DENOISE_FEATURES];
	float ccl_restrict_ptr pixel_buffer;
	int2 pixel;

	/* === Calculate denoising window. === */
	int2 low  = make_int2(max(rect.x, x - radius),
	                      max(rect.y, y - radius));
	int2 high = make_int2(min(rect.z, x + radius + 1),
	                      min(rect.w, y + radius + 1));

	/* === Shift feature passes to have mean 0. === */
	float feature_means[DENOISE_FEATURES];
	math_vector_zero(feature_means, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW {
		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
		math_vector_add(feature_means, features, DENOISE_FEATURES);
	} END_FOR_PIXEL_WINDOW

	/* Divide the sums by the number of pixels in the window. */
	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
	math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);

	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
	float *feature_scale = tempvector;
	math_vector_zero(feature_scale, DENOISE_FEATURES);

	FOR_PIXEL_WINDOW {
		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
		math_vector_max(feature_scale, features, DENOISE_FEATURES);
	} END_FOR_PIXEL_WINDOW

	filter_calculate_scale(feature_scale);

	/* === Generate the feature transformation. ===
	 * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
	float* feature_matrix = tempmatrix;
	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW {
		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
	} END_FOR_PIXEL_WINDOW

	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);

	/* === Select the rank from the diagonal of the decomposed matrix. ===
	 * The first two rows are always kept. */
	int num_kept = 0;
	if(pca_threshold < 0.0f) {
		/* Relative mode: keep rows until enough of the total diagonal energy is covered. */
		float energy_cutoff = 0.0f;
		for(int i = 0; i < DENOISE_FEATURES; i++) {
			energy_cutoff += feature_matrix[i*DENOISE_FEATURES+i];
		}
		energy_cutoff *= 1.0f - (-pca_threshold);

		float kept_energy = 0.0f;
		while(num_kept < DENOISE_FEATURES) {
			if(num_kept >= 2 && kept_energy >= energy_cutoff) {
				break;
			}
			kept_energy += feature_matrix[num_kept*DENOISE_FEATURES+num_kept];
			num_kept++;
		}
	}
	else {
		/* Absolute mode: stop at the first row whose diagonal entry is too small. */
		while(num_kept < DENOISE_FEATURES) {
			const float diagonal = feature_matrix[num_kept*DENOISE_FEATURES+num_kept];
			if(num_kept >= 2 && sqrtf(diagonal) < pca_threshold) {
				break;
			}
			num_kept++;
		}
	}
	*rank = num_kept;

	/* Bake the feature scaling into the transformation matrix. */
	for(int row = 0; row < num_kept; row++) {
		math_vector_mul(transform + row*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES);
	}
	math_matrix_transpose(transform, DENOISE_FEATURES, 1);
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
117
intern/cycles/kernel/filter/filter_transform_gpu.h
Normal file
117
intern/cycles/kernel/filter/filter_transform_gpu.h
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* Build the feature-space transform for the pixel (x, y) (GPU variant).
 *
 * The function looks at the window of pixels within `radius` of (x, y), clamped to `rect`,
 * and performs the following steps on the denoising features of these pixels:
 *  1. compute the per-feature mean over the window,
 *  2. compute a per-feature scale from the mean-shifted features,
 *  3. accumulate the Gramian of the shifted and scaled features,
 *  4. eigendecompose that matrix into `transform`,
 *  5. pick the number of retained components and store it in `*rank`,
 *  6. transpose `transform` and bake the feature scale into it.
 *
 * buffer:           feature pass data, read through the FOR_PIXEL_WINDOW macro (defined elsewhere).
 * rect:             window clamp; x/y are used as lower and z/w as exclusive upper bounds.
 * pass_stride:      forwarded to the feature fetch helpers.
 * transform:        output matrix, addressed with `transform_stride` between elements.
 * rank:             output, number of retained components (at least 2 whenever DENOISE_FEATURES >= 2).
 * pca_threshold:    negative: keep components until their summed diagonal entries reach
 *                   (1 + pca_threshold) of the total; otherwise: keep components while the
 *                   square root of the diagonal entry is at least pca_threshold.
 * transform_stride: element stride of `transform`.
 * localIdx:         selects this thread's slice of the shared feature scratch space on CUDA.
 */
ccl_device void kernel_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
                                                  int x, int y, int4 rect,
                                                  int pass_stride,
                                                  ccl_global float *transform,
                                                  ccl_global int *rank,
                                                  int radius, float pca_threshold,
                                                  int transform_stride, int localIdx)
{
	/* NOTE(review): buffer_w is not referenced directly below; presumably the
	 * FOR_PIXEL_WINDOW macro consumes it together with `buffer` - confirm. */
	int buffer_w = align_up(rect.z - rect.x, 4);

#ifdef __KERNEL_CUDA__
	/* Per-thread scratch space for one feature vector, carved out of a block-wide array. */
	ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE];
	ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES;
#else
	float features[DENOISE_FEATURES];
#endif

	/* === Calculate denoising window. === */
	int2 low  = make_int2(max(rect.x, x - radius),
	                      max(rect.y, y - radius));
	int2 high = make_int2(min(rect.z, x + radius + 1),
	                      min(rect.w, y + radius + 1));
	/* Iteration state, presumably driven by FOR_PIXEL_WINDOW. */
	ccl_global float ccl_restrict_ptr pixel_buffer;
	int2 pixel;

	/* === Shift feature passes to have mean 0. === */
	float feature_means[DENOISE_FEATURES];
	math_vector_zero(feature_means, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW {
		filter_get_features(pixel, pixel_buffer, features, NULL, pass_stride);
		math_vector_add(feature_means, features, DENOISE_FEATURES);
	} END_FOR_PIXEL_WINDOW

	/* Turn the accumulated sums into means by dividing by the window's pixel count. */
	float pixel_scale = 1.0f / ((high.y - low.y) * (high.x - low.x));
	math_vector_scale(feature_means, pixel_scale, DENOISE_FEATURES);

	/* === Scale the shifted feature passes to a range of [-1; 1], will be baked into the transform later. === */
	float feature_scale[DENOISE_FEATURES];
	math_vector_zero(feature_scale, DENOISE_FEATURES);

	FOR_PIXEL_WINDOW {
		filter_get_feature_scales(pixel, pixel_buffer, features, feature_means, pass_stride);
		math_vector_max(feature_scale, features, DENOISE_FEATURES);
	} END_FOR_PIXEL_WINDOW

	filter_calculate_scale(feature_scale);

	/* === Generate the feature transformation. ===
	 * This transformation maps the DENOISE_FEATURES-dimensional feature space to a reduced feature (r-feature) space
	 * which generally has fewer dimensions. This mainly helps to prevent overfitting. */
	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
	math_matrix_zero(feature_matrix, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW {
		filter_get_features(pixel, pixel_buffer, features, feature_means, pass_stride);
		math_vector_mul(features, feature_scale, DENOISE_FEATURES);
		math_matrix_add_gramian(feature_matrix, DENOISE_FEATURES, features, 1.0f);
	} END_FOR_PIXEL_WINDOW

	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, transform_stride);

	/* === Select the rank of the reduced feature space. ===
	 * The diagonal of feature_matrix is read as the per-component energy
	 * (presumably the eigenvalues in descending order - confirm against the decomposition helper). */
	*rank = 0;
	if(pca_threshold < 0.0f) {
		/* Relative mode: keep components until (1 + pca_threshold) of the total energy is covered. */
		float threshold_energy = 0.0f;
		for(int i = 0; i < DENOISE_FEATURES; i++) {
			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
		}
		threshold_energy *= 1.0f - (-pca_threshold);

		float reduced_energy = 0.0f;
		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
			/* The first two components are always kept. */
			if(i >= 2 && reduced_energy >= threshold_energy)
				break;
			float s = feature_matrix[i*DENOISE_FEATURES+i];
			reduced_energy += s;
		}
	}
	else {
		/* Absolute mode: drop components once sqrt(energy) falls below the threshold. */
		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
			float s = feature_matrix[i*DENOISE_FEATURES+i];
			/* The first two components are always kept. */
			if(i >= 2 && sqrtf(s) < pca_threshold)
				break;
		}
	}

	math_matrix_transpose(transform, DENOISE_FEATURES, transform_stride);

	/* Bake the feature scaling into the transformation matrix.
	 * The matrix is addressed with transform_stride between elements, exactly like in the
	 * eigendecomposition and transpose calls above; indexing without the stride would scale
	 * the wrong memory locations whenever the stride is not 1. */
	for(int i = 0; i < DENOISE_FEATURES; i++) {
		for(int j = 0; j < (*rank); j++) {
			transform[(i*DENOISE_FEATURES + j)*transform_stride] *= feature_scale[i];
		}
	}
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
110
intern/cycles/kernel/filter/filter_transform_sse.h
Normal file
110
intern/cycles/kernel/filter/filter_transform_sse.h
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
/* Build the feature-space transform for the pixel (x, y) (SSE variant).
 *
 * Same sequence of steps as the scalar kernels, with the per-pixel work done on __m128 lanes:
 * feature means, feature scales, Gramian of the shifted and scaled features, eigendecomposition,
 * rank selection, transpose, and finally baking the feature scale into the transform.
 *
 * buffer:        feature pass data, read through the FOR_PIXEL_WINDOW_SSE macro (defined elsewhere).
 * rect:          window clamp; x/y are used as lower and z/w as exclusive upper bounds.
 * pass_stride:   forwarded to the feature fetch helpers.
 * transform:     output matrix, DENOISE_FEATURES x DENOISE_FEATURES floats, stride 1.
 * rank:          output, number of retained components (at least 2 whenever DENOISE_FEATURES >= 2).
 * pca_threshold: negative: keep components until their summed diagonal entries reach
 *                (1 + pca_threshold) of the total; otherwise: keep components while the
 *                square root of the diagonal entry is at least pca_threshold.
 */
ccl_device void kernel_filter_construct_transform(float ccl_restrict_ptr buffer,
                                                  int x, int y, int4 rect,
                                                  int pass_stride,
                                                  float *transform, int *rank,
                                                  int radius, float pca_threshold)
{
	/* NOTE(review): buffer_w is not referenced directly below; presumably the
	 * FOR_PIXEL_WINDOW_SSE macro consumes it together with `buffer` - confirm. */
	int buffer_w = align_up(rect.z - rect.x, 4);

	__m128 features[DENOISE_FEATURES];
	/* Iteration state, presumably driven by FOR_PIXEL_WINDOW_SSE (which also appears to
	 * provide x4, y4 and active_pixels). */
	float ccl_restrict_ptr pixel_buffer;
	int2 pixel;

	/* Denoising window around (x, y), clamped to rect. */
	int2 low  = make_int2(max(rect.x, x - radius),
	                      max(rect.y, y - radius));
	int2 high = make_int2(min(rect.z, x + radius + 1),
	                      min(rect.w, y + radius + 1));

	/* Shift feature passes to have mean 0. */
	__m128 feature_means[DENOISE_FEATURES];
	math_vector_zero_sse(feature_means, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW_SSE {
		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, NULL, pass_stride);
		math_vector_add_sse(feature_means, DENOISE_FEATURES, features);
	} END_FOR_PIXEL_WINDOW_SSE

	/* Reduce the lanes of each accumulator and divide by the window's pixel count. */
	__m128 pixel_scale = _mm_set1_ps(1.0f / ((high.y - low.y) * (high.x - low.x)));
	for(int i = 0; i < DENOISE_FEATURES; i++) {
		feature_means[i] = _mm_mul_ps(_mm_hsum_ps(feature_means[i]), pixel_scale);
	}

	/* Per-feature scale of the shifted features, baked into the transform at the end. */
	__m128 feature_scale[DENOISE_FEATURES];
	math_vector_zero_sse(feature_scale, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW_SSE {
		filter_get_feature_scales_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
		math_vector_max_sse(feature_scale, features, DENOISE_FEATURES);
	} END_FOR_PIXEL_WINDOW_SSE

	filter_calculate_scale_sse(feature_scale);

	/* Accumulate the Gramian of the shifted and scaled features per lane... */
	__m128 feature_matrix_sse[DENOISE_FEATURES*DENOISE_FEATURES];
	math_matrix_zero_sse(feature_matrix_sse, DENOISE_FEATURES);
	FOR_PIXEL_WINDOW_SSE {
		filter_get_features_sse(x4, y4, active_pixels, pixel_buffer, features, feature_means, pass_stride);
		math_vector_mul_sse(features, DENOISE_FEATURES, feature_scale);
		math_matrix_add_gramian_sse(feature_matrix_sse, DENOISE_FEATURES, features, _mm_set1_ps(1.0f));
	} END_FOR_PIXEL_WINDOW_SSE

	/* ...and collapse the lanes into a scalar matrix. */
	float feature_matrix[DENOISE_FEATURES*DENOISE_FEATURES];
	math_matrix_hsum(feature_matrix, DENOISE_FEATURES, feature_matrix_sse);

	math_matrix_jacobi_eigendecomposition(feature_matrix, transform, DENOISE_FEATURES, 1);

	/* Select the rank of the reduced feature space.
	 * The diagonal of feature_matrix is read as the per-component energy
	 * (presumably the eigenvalues in descending order - confirm against the decomposition helper).
	 * Only the rank is determined here; the feature scaling is applied exactly once, below. */
	*rank = 0;
	if(pca_threshold < 0.0f) {
		/* Relative mode: keep components until (1 + pca_threshold) of the total energy is covered. */
		float threshold_energy = 0.0f;
		for(int i = 0; i < DENOISE_FEATURES; i++) {
			threshold_energy += feature_matrix[i*DENOISE_FEATURES+i];
		}
		threshold_energy *= 1.0f - (-pca_threshold);

		float reduced_energy = 0.0f;
		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
			/* The first two components are always kept. */
			if(i >= 2 && reduced_energy >= threshold_energy)
				break;
			float s = feature_matrix[i*DENOISE_FEATURES+i];
			reduced_energy += s;
		}
	}
	else {
		/* Absolute mode: drop components once sqrt(energy) falls below the threshold. */
		for(int i = 0; i < DENOISE_FEATURES; i++, (*rank)++) {
			float s = feature_matrix[i*DENOISE_FEATURES+i];
			/* The first two components are always kept. */
			if(i >= 2 && sqrtf(s) < pca_threshold)
				break;
		}
	}

	math_matrix_transpose(transform, DENOISE_FEATURES, 1);

	/* Bake the feature scaling into the transformation matrix.
	 * After the transpose, row i belongs to feature i and the first *rank columns are the
	 * retained components. All lanes of feature_scale[i] are expected to hold the same value
	 * at this point, so the lowest lane is used - TODO confirm in filter_calculate_scale_sse. */
	for(int i = 0; i < DENOISE_FEATURES; i++) {
		math_vector_scale(transform + i*DENOISE_FEATURES, _mm_cvtss_f32(feature_scale[i]), *rank);
	}
}
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
@@ -76,7 +76,7 @@ ccl_device_inline void triangle_vertices(KernelGlobals *kg, int prim, float3 P[3
|
|||||||
|
|
||||||
/* Interpolate smooth vertex normal from vertices */
|
/* Interpolate smooth vertex normal from vertices */
|
||||||
|
|
||||||
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, float u, float v)
|
ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, float3 Ng, int prim, float u, float v)
|
||||||
{
|
{
|
||||||
/* load triangle vertices */
|
/* load triangle vertices */
|
||||||
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
|
const uint4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
|
||||||
@@ -84,7 +84,9 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
|
|||||||
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
|
float3 n1 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.y));
|
||||||
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
|
float3 n2 = float4_to_float3(kernel_tex_fetch(__tri_vnormal, tri_vindex.z));
|
||||||
|
|
||||||
return normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
|
float3 N = safe_normalize((1.0f - u - v)*n2 + u*n0 + v*n1);
|
||||||
|
|
||||||
|
return is_zero(N)? Ng: N;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Ray differentials on triangle */
|
/* Ray differentials on triangle */
|
||||||
|
@@ -50,30 +50,20 @@ void kernel_tex_copy(KernelGlobals *kg,
|
|||||||
#define KERNEL_ARCH cpu
|
#define KERNEL_ARCH cpu
|
||||||
#include "kernel/kernels/cpu/kernel_cpu.h"
|
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
#define KERNEL_ARCH cpu_sse2
|
||||||
# define KERNEL_ARCH cpu_sse2
|
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
#define KERNEL_ARCH cpu_sse3
|
||||||
# define KERNEL_ARCH cpu_sse3
|
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
#define KERNEL_ARCH cpu_sse41
|
||||||
# define KERNEL_ARCH cpu_sse41
|
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
#define KERNEL_ARCH cpu_avx
|
||||||
# define KERNEL_ARCH cpu_avx
|
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
#define KERNEL_ARCH cpu_avx2
|
||||||
# define KERNEL_ARCH cpu_avx2
|
#include "kernel/kernels/cpu/kernel_cpu.h"
|
||||||
# include "kernel/kernels/cpu/kernel_cpu.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@@ -222,6 +222,12 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
|
|||||||
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
|
L->path_total_shaded = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
|
L->shadow_color = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
L->denoising_normal = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
|
L->denoising_albedo = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
|
L->denoising_depth = 0.0f;
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
|
ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
|
||||||
@@ -277,15 +283,15 @@ ccl_device_inline void path_radiance_accum_emission(PathRadiance *L, float3 thro
|
|||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
|
ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
|
||||||
|
ccl_addr_space PathState *state,
|
||||||
float3 throughput,
|
float3 throughput,
|
||||||
float3 alpha,
|
float3 alpha,
|
||||||
float3 bsdf,
|
float3 bsdf,
|
||||||
float3 ao,
|
float3 ao)
|
||||||
int bounce)
|
|
||||||
{
|
{
|
||||||
#ifdef __PASSES__
|
#ifdef __PASSES__
|
||||||
if(L->use_light_pass) {
|
if(L->use_light_pass) {
|
||||||
if(bounce == 0) {
|
if(state->bounce == 0) {
|
||||||
/* directly visible lighting */
|
/* directly visible lighting */
|
||||||
L->direct_diffuse += throughput*bsdf*ao;
|
L->direct_diffuse += throughput*bsdf*ao;
|
||||||
L->ao += alpha*throughput*ao;
|
L->ao += alpha*throughput*ao;
|
||||||
@@ -302,31 +308,43 @@ ccl_device_inline void path_radiance_accum_ao(PathRadiance *L,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
|
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||||
float3 light = throughput * bsdf;
|
float3 light = throughput * bsdf;
|
||||||
L->path_total += light;
|
L->path_total += light;
|
||||||
L->path_total_shaded += ao * light;
|
L->path_total_shaded += ao * light;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_accum_total_ao(
|
ccl_device_inline void path_radiance_accum_total_ao(
|
||||||
PathRadiance *L,
|
PathRadiance *L,
|
||||||
|
ccl_addr_space PathState *state,
|
||||||
float3 throughput,
|
float3 throughput,
|
||||||
float3 bsdf)
|
float3 bsdf)
|
||||||
{
|
{
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
|
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||||
L->path_total += throughput * bsdf;
|
L->path_total += throughput * bsdf;
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
(void) L;
|
(void) L;
|
||||||
|
(void) state;
|
||||||
(void) throughput;
|
(void) throughput;
|
||||||
(void) bsdf;
|
(void) bsdf;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 throughput, BsdfEval *bsdf_eval, float3 shadow, float shadow_fac, int bounce, bool is_lamp)
|
ccl_device_inline void path_radiance_accum_light(PathRadiance *L,
|
||||||
|
ccl_addr_space PathState *state,
|
||||||
|
float3 throughput,
|
||||||
|
BsdfEval *bsdf_eval,
|
||||||
|
float3 shadow,
|
||||||
|
float shadow_fac,
|
||||||
|
bool is_lamp)
|
||||||
{
|
{
|
||||||
#ifdef __PASSES__
|
#ifdef __PASSES__
|
||||||
if(L->use_light_pass) {
|
if(L->use_light_pass) {
|
||||||
if(bounce == 0) {
|
if(state->bounce == 0) {
|
||||||
/* directly visible lighting */
|
/* directly visible lighting */
|
||||||
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
|
L->direct_diffuse += throughput*bsdf_eval->diffuse*shadow;
|
||||||
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
|
L->direct_glossy += throughput*bsdf_eval->glossy*shadow;
|
||||||
@@ -352,21 +370,27 @@ ccl_device_inline void path_radiance_accum_light(PathRadiance *L, float3 through
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
|
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||||
float3 light = throughput * bsdf_eval->sum_no_mis;
|
float3 light = throughput * bsdf_eval->sum_no_mis;
|
||||||
L->path_total += light;
|
L->path_total += light;
|
||||||
L->path_total_shaded += shadow * light;
|
L->path_total_shaded += shadow * light;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_accum_total_light(
|
ccl_device_inline void path_radiance_accum_total_light(
|
||||||
PathRadiance *L,
|
PathRadiance *L,
|
||||||
|
ccl_addr_space PathState *state,
|
||||||
float3 throughput,
|
float3 throughput,
|
||||||
const BsdfEval *bsdf_eval)
|
const BsdfEval *bsdf_eval)
|
||||||
{
|
{
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
|
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||||
L->path_total += throughput * bsdf_eval->sum_no_mis;
|
L->path_total += throughput * bsdf_eval->sum_no_mis;
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
(void) L;
|
(void) L;
|
||||||
|
(void) state;
|
||||||
(void) throughput;
|
(void) throughput;
|
||||||
(void) bsdf_eval;
|
(void) bsdf_eval;
|
||||||
#endif
|
#endif
|
||||||
@@ -393,11 +417,17 @@ ccl_device_inline void path_radiance_accum_background(PathRadiance *L,
|
|||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
|
if(state->flag & PATH_RAY_STORE_SHADOW_INFO) {
|
||||||
L->path_total += throughput * value;
|
L->path_total += throughput * value;
|
||||||
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
|
if(state->flag & PATH_RAY_SHADOW_CATCHER_ONLY) {
|
||||||
L->path_total_shaded += throughput * value;
|
L->path_total_shaded += throughput * value;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
L->denoising_albedo += state->denoising_feature_weight * value;
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
|
ccl_device_inline void path_radiance_sum_indirect(PathRadiance *L)
|
||||||
@@ -555,6 +585,38 @@ ccl_device_inline float3 path_radiance_clamp_and_sum(KernelGlobals *kg, PathRadi
|
|||||||
return L_sum;
|
return L_sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Split the radiance accumulated in L into a part that is handed to the denoiser (noisy)
 * and a part that bypasses it (clean).
 *
 * With light passes available, emission and background always count as clean, both scatter
 * components always count as noisy, and each remaining direct/indirect component goes to
 * whichever side the matching bit in kernel_data.film.denoising_flags selects.
 * Without light passes, all of L->emission is reported as noisy and clean is zero.
 * Both outputs are passed through ensure_finite3() before returning. */
ccl_device_inline void path_radiance_split_denoising(KernelGlobals *kg, PathRadiance *L, float3 *noisy, float3 *clean)
{
#ifdef __PASSES__
	kernel_assert(L->use_light_pass);

	/* Components with a fixed destination. */
	*clean = L->emission + L->background;
	*noisy = L->direct_scatter + L->indirect_scatter;

	/* Route one component to the clean or the noisy sum, depending on its flag bit. */
#  define ROUTE_COMPONENT(flag, component) \
	do { \
		if(kernel_data.film.denoising_flags & flag) { \
			*clean += component; \
		} \
		else { \
			*noisy += component; \
		} \
	} while(0)

	ROUTE_COMPONENT(DENOISING_CLEAN_DIFFUSE_DIR,      L->direct_diffuse);
	ROUTE_COMPONENT(DENOISING_CLEAN_DIFFUSE_IND,      L->indirect_diffuse);
	ROUTE_COMPONENT(DENOISING_CLEAN_GLOSSY_DIR,       L->direct_glossy);
	ROUTE_COMPONENT(DENOISING_CLEAN_GLOSSY_IND,       L->indirect_glossy);
	ROUTE_COMPONENT(DENOISING_CLEAN_TRANSMISSION_DIR, L->direct_transmission);
	ROUTE_COMPONENT(DENOISING_CLEAN_TRANSMISSION_IND, L->indirect_transmission);
	ROUTE_COMPONENT(DENOISING_CLEAN_SUBSURFACE_DIR,   L->direct_subsurface);
	ROUTE_COMPONENT(DENOISING_CLEAN_SUBSURFACE_IND,   L->indirect_subsurface);

#  undef ROUTE_COMPONENT
#else
	/* No per-component data available: everything is treated as noisy. */
	*noisy = L->emission;
	*clean = make_float3(0.0f, 0.0f, 0.0f);
#endif

	*noisy = ensure_finite3(*noisy);
	*clean = ensure_finite3(*clean);
}
|
||||||
|
|
||||||
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
|
ccl_device_inline void path_radiance_accum_sample(PathRadiance *L, PathRadiance *L_sample, int num_samples)
|
||||||
{
|
{
|
||||||
float fac = 1.0f/num_samples;
|
float fac = 1.0f/num_samples;
|
||||||
@@ -595,12 +657,12 @@ ccl_device_inline float path_radiance_sum_shadow(const PathRadiance *L)
|
|||||||
/* Calculate final light sum and transparency for shadow catcher object. */
|
/* Calculate final light sum and transparency for shadow catcher object. */
|
||||||
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
|
ccl_device_inline float3 path_radiance_sum_shadowcatcher(KernelGlobals *kg,
|
||||||
const PathRadiance *L,
|
const PathRadiance *L,
|
||||||
ccl_addr_space float* L_transparent)
|
float* alpha)
|
||||||
{
|
{
|
||||||
const float shadow = path_radiance_sum_shadow(L);
|
const float shadow = path_radiance_sum_shadow(L);
|
||||||
float3 L_sum;
|
float3 L_sum;
|
||||||
if(kernel_data.background.transparent) {
|
if(kernel_data.background.transparent) {
|
||||||
*L_transparent = shadow;
|
*alpha = 1.0f-shadow;
|
||||||
L_sum = make_float3(0.0f, 0.0f, 0.0f);
|
L_sum = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@@ -42,6 +42,8 @@
|
|||||||
#include "util/util_types.h"
|
#include "util/util_types.h"
|
||||||
#include "util/util_texture.h"
|
#include "util/util_texture.h"
|
||||||
|
|
||||||
|
#define ccl_restrict_ptr const * __restrict
|
||||||
|
|
||||||
#define ccl_addr_space
|
#define ccl_addr_space
|
||||||
|
|
||||||
#define ccl_local_id(d) 0
|
#define ccl_local_id(d) 0
|
||||||
|
@@ -55,6 +55,10 @@
|
|||||||
#define ccl_restrict __restrict__
|
#define ccl_restrict __restrict__
|
||||||
#define ccl_align(n) __align__(n)
|
#define ccl_align(n) __align__(n)
|
||||||
|
|
||||||
|
#define ccl_restrict_ptr const * __restrict__
|
||||||
|
#define CCL_MAX_LOCAL_SIZE (CUDA_THREADS_BLOCK_WIDTH*CUDA_THREADS_BLOCK_WIDTH)
|
||||||
|
|
||||||
|
|
||||||
/* No assert supported for CUDA */
|
/* No assert supported for CUDA */
|
||||||
|
|
||||||
#define kernel_assert(cond)
|
#define kernel_assert(cond)
|
||||||
|
@@ -50,6 +50,8 @@
|
|||||||
# define ccl_addr_space
|
# define ccl_addr_space
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define ccl_restrict_ptr const * __restrict__
|
||||||
|
|
||||||
#define ccl_local_id(d) get_local_id(d)
|
#define ccl_local_id(d) get_local_id(d)
|
||||||
#define ccl_global_id(d) get_global_id(d)
|
#define ccl_global_id(d) get_global_id(d)
|
||||||
|
|
||||||
|
@@ -102,7 +102,7 @@ ccl_device_inline float area_light_sample(float3 P,
|
|||||||
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
|
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
|
||||||
cu = clamp(cu, -1.0f, 1.0f);
|
cu = clamp(cu, -1.0f, 1.0f);
|
||||||
/* Compute xu. */
|
/* Compute xu. */
|
||||||
float xu = -(cu * z0) / sqrtf(1.0f - cu * cu);
|
float xu = -(cu * z0) / max(sqrtf(1.0f - cu * cu), 1e-7f);
|
||||||
xu = clamp(xu, x0, x1);
|
xu = clamp(xu, x0, x1);
|
||||||
/* Compute yv. */
|
/* Compute yv. */
|
||||||
float z0sq = z0 * z0;
|
float z0sq = z0 * z0;
|
||||||
|
@@ -60,6 +60,135 @@ ccl_device_inline void kernel_write_pass_float4(ccl_global float *buffer, int sa
|
|||||||
#endif /* __SPLIT_KERNEL__ */
|
#endif /* __SPLIT_KERNEL__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
/* Write a scalar sample into a pass that is followed by a second slot for variance data.
 *
 * buffer[0] receives `value` through kernel_write_pass_float(); buffer[1] receives the
 * matching variance contribution:
 *  - split kernel: value^2, i.e. the E[x^2] - 1/N * (E[x])^2 fallback, because the online
 *    update below can't easily be implemented with atomics,
 *  - otherwise: the term of the online one-pass variance update,
 *    (value - mean including this sample) * (value - mean excluding this sample). */
ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer, int sample, float value)
{
	kernel_write_pass_float(buffer, sample, value);

#  ifdef __SPLIT_KERNEL__
	kernel_write_pass_float(buffer+1, sample, value*value);
#  else
	/* The very first sample has no spread yet, so it contributes zero. */
	float variance_term = 0.0f;
	if(sample != 0) {
		/* buffer[0] is read after the write above, so it already includes `value`. */
		const float sum = buffer[0];
		const float mean_with_value = sum * (1.0f / (sample + 1));
		const float mean_without_value = (sum - value) * (1.0f / sample);
		variance_term = (value - mean_with_value) * (value - mean_without_value);
	}
	kernel_write_pass_float(buffer+1, sample, variance_term);
#  endif
}
|
||||||
|
|
||||||
|
# if defined(__SPLIT_KERNEL__)
|
||||||
|
# define kernel_write_pass_float3_unaligned kernel_write_pass_float3
|
||||||
|
# else
|
||||||
|
ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buffer, int sample, float3 value)
|
||||||
|
{
|
||||||
|
buffer[0] = (sample == 0)? value.x: buffer[0] + value.x;
|
||||||
|
buffer[1] = (sample == 0)? value.y: buffer[1] + value.y;
|
||||||
|
buffer[2] = (sample == 0)? value.z: buffer[2] + value.z;
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
|
||||||
|
/* Write a float3 sample into a pass that is followed by three more floats of variance data.
 *
 * buffer[0..2] receive `value`; buffer[3..5] receive the matching variance contribution:
 * value*value in the split kernel, and otherwise the per-component term
 * (value - mean including this sample) * (value - mean excluding this sample). */
ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
{
	kernel_write_pass_float3_unaligned(buffer, sample, value);

#  ifdef __SPLIT_KERNEL__
	kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
#  else
	/* The very first sample has no spread yet, so it contributes zero. */
	float3 variance_term = make_float3(0.0f, 0.0f, 0.0f);
	if(sample != 0) {
		/* The sums are read after the write above, so they already include `value`. */
		const float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
		const float3 mean_with_value = sum * (1.0f / (sample + 1));
		const float3 mean_without_value = (sum - value) * (1.0f / sample);
		variance_term = (value - mean_with_value) * (value - mean_without_value);
	}
	kernel_write_pass_float3_unaligned(buffer+3, sample, variance_term);
#  endif
}
|
||||||
|
|
||||||
|
/* Write the shadowing information of one sample into the denoising passes.
 *
 * Samples alternate between two sets of slots: odd samples go to DENOISING_PASS_SHADOW_B,
 * even samples to DENOISING_PASS_SHADOW_A. Each set therefore only sees every second sample,
 * which is why all writes use sample/2 as the per-set sample index.
 * Within a set, slot 0 receives path_total, slot 1 path_total_shaded, and slot 2 the variance
 * contribution of the ratio shaded / total.
 *
 * buffer: expected to point at the start of the denoising data for this pixel.
 * Does nothing when kernel_data.film.pass_denoising_data is 0. */
ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
                                                     int sample, float path_total, float path_total_shaded)
{
	if(kernel_data.film.pass_denoising_data == 0)
		return;

	buffer += (sample & 1)? DENOISING_PASS_SHADOW_B : DENOISING_PASS_SHADOW_A;

	path_total = ensure_finite(path_total);
	path_total_shaded = ensure_finite(path_total_shaded);

	kernel_write_pass_float(buffer, sample/2, path_total);
	kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);

	/* Ratio of shaded to total light for this sample; the max() guards against division by zero. */
	float value = path_total_shaded / max(path_total, 1e-7f);
#  ifdef __SPLIT_KERNEL__
	kernel_write_pass_float(buffer+2, sample/2, value*value);
#  else
	if(sample < 2) {
		/* First sample of this set: no spread yet. */
		kernel_write_pass_float(buffer+2, sample/2, 0.0f);
	}
	else {
		/* Online variance term, using the ratio of the accumulated sums without and with
		 * the current sample (the sums were already updated above). */
		float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
		float new_value = buffer[1] / max(buffer[0], 1e-7f);
		/* Use the per-set sample index here as well, consistent with the other writes. */
		kernel_write_pass_float(buffer+2, sample/2, (value - new_value) * (value - old_value));
	}
#  endif
}
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
|
|
||||||
|
/* Accumulate the denoising feature passes (depth, normal, albedo) for the current shading point.
 *
 * Nothing happens once state->denoising_feature_weight is zero. Otherwise the weighted ray
 * length is always added to the depth feature. Normal and albedo are gathered from the BSDF
 * and BSSRDF closures of `sd`; they are only written - and the feature weight set to zero so
 * later bounces are ignored - when there are no such closures at all or when more than a
 * quarter of the sample weight belongs to closures that are not specular-like.
 * Without __DENOISING_FEATURES__ the function does nothing. */
ccl_device_inline void kernel_update_denoising_features(KernelGlobals *kg,
                                                        ShaderData *sd,
                                                        ccl_global PathState *state,
                                                        PathRadiance *L)
{
#ifdef __DENOISING_FEATURES__
	if(state->denoising_feature_weight == 0.0f) {
		return;
	}

	L->denoising_depth += ensure_finite(state->denoising_feature_weight * sd->ray_length);

	float3 normal_sum = make_float3(0.0f, 0.0f, 0.0f);
	float3 albedo_sum = make_float3(0.0f, 0.0f, 0.0f);
	float total_weight = 0.0f;
	float nonspecular_weight = 0.0f;

	for(int closure_index = 0; closure_index < sd->num_closure; closure_index++) {
		ShaderClosure *closure = &sd->closure[closure_index];

		if(CLOSURE_IS_BSDF_OR_BSSRDF(closure->type)) {
			/* All closures contribute to the normal feature, but only diffuse-like ones to the albedo. */
			normal_sum += closure->N * closure->sample_weight;
			total_weight += closure->sample_weight;
			if(!bsdf_is_specular_like(closure)) {
				albedo_sum += closure->weight;
				nonspecular_weight += closure->sample_weight;
			}
		}
	}

	/* Wait for next bounce if 75% or more sample weight belongs to specular-like closures. */
	const bool wait_for_next_bounce = (total_weight != 0.0f) &&
	                                  !(nonspecular_weight*4.0f > total_weight);
	if(wait_for_next_bounce) {
		return;
	}

	/* Normalize the weighted normal; with zero total weight it simply stays zero. */
	if(total_weight != 0.0f) {
		normal_sum /= total_weight;
	}
	L->denoising_normal += ensure_finite3(state->denoising_feature_weight * normal_sum);
	L->denoising_albedo += ensure_finite3(state->denoising_feature_weight * albedo_sum);

	/* Features are final for this path. */
	state->denoising_feature_weight = 0.0f;
#else
	(void) kg;
	(void) sd;
	(void) state;
	(void) L;
#endif  /* __DENOISING_FEATURES__ */
}
|
||||||
|
|
||||||
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
|
ccl_device_inline void kernel_write_data_passes(KernelGlobals *kg, ccl_global float *buffer, PathRadiance *L,
|
||||||
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
|
ShaderData *sd, int sample, ccl_addr_space PathState *state, float3 throughput)
|
||||||
{
|
{
|
||||||
@@ -199,5 +328,79 @@ ccl_device_inline void kernel_write_light_passes(KernelGlobals *kg, ccl_global f
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Write the final result of one sample into the render buffer.
 *
 * With a valid L: the combined color plus alpha, the light passes and - if enabled - the
 * denoising passes (shadowing, color, normal, albedo, depth and optionally the clean pass)
 * are written. For shadow catchers (with __SHADOW_TRICKS__) the color comes from
 * path_radiance_sum_shadowcatcher(), which may also replace alpha.
 * With L == NULL: zeros are written to the combined pass and to all enabled denoising passes. */
ccl_device_inline void kernel_write_result(KernelGlobals *kg, ccl_global float *buffer,
                                           int sample, PathRadiance *L, float alpha, bool is_shadow_catcher)
{
	if(!L) {
		const float3 zero = make_float3(0.0f, 0.0f, 0.0f);

		kernel_write_pass_float4(buffer, sample, make_float4(0.0f, 0.0f, 0.0f, 0.0f));

#ifdef __DENOISING_FEATURES__
		if(kernel_data.film.pass_denoising_data) {
			ccl_global float *denoising_buffer = buffer + kernel_data.film.pass_denoising_data;

			kernel_write_denoising_shadow(kg, denoising_buffer, sample, 0.0f, 0.0f);

			kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_COLOR, sample, zero);

			kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_NORMAL, sample, zero);
			kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_ALBEDO, sample, zero);
			kernel_write_pass_float_variance(denoising_buffer + DENOISING_PASS_DEPTH, sample, 0.0f);

			if(kernel_data.film.pass_denoising_clean) {
				kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
				                                   sample, zero);
			}
		}
#endif  /* __DENOISING_FEATURES__ */
		return;
	}

	/* Combined color; the shadow catcher variant may update alpha through the pointer. */
#ifdef __SHADOW_TRICKS__
	float3 L_sum = (is_shadow_catcher)? path_radiance_sum_shadowcatcher(kg, L, &alpha)
	                                  : path_radiance_clamp_and_sum(kg, L);
#else
	float3 L_sum = path_radiance_clamp_and_sum(kg, L);
#endif  /* __SHADOW_TRICKS__ */

	kernel_write_pass_float4(buffer, sample, make_float4(L_sum.x, L_sum.y, L_sum.z, alpha));

	kernel_write_light_passes(kg, buffer, L, sample);

#ifdef __DENOISING_FEATURES__
	if(kernel_data.film.pass_denoising_data) {
		ccl_global float *denoising_buffer = buffer + kernel_data.film.pass_denoising_data;

#  ifdef __SHADOW_TRICKS__
		kernel_write_denoising_shadow(kg, denoising_buffer, sample,
		                              average(L->path_total), average(L->path_total_shaded));
#  else
		kernel_write_denoising_shadow(kg, denoising_buffer, sample, 0.0f, 0.0f);
#  endif

		if(kernel_data.film.pass_denoising_clean) {
			/* Only the noisy part goes through the denoiser, the clean part gets its own pass. */
			float3 noisy, clean;
			path_radiance_split_denoising(kg, L, &noisy, &clean);
			kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_COLOR, sample, noisy);
			kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_clean,
			                                   sample, clean);
		}
		else {
			kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_COLOR, sample, L_sum);
		}

		kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_NORMAL,
		                                  sample, L->denoising_normal);
		kernel_write_pass_float3_variance(denoising_buffer + DENOISING_PASS_ALBEDO,
		                                  sample, L->denoising_albedo);
		kernel_write_pass_float_variance(denoising_buffer + DENOISING_PASS_DEPTH,
		                                 sample, L->denoising_depth);
	}
#endif  /* __DENOISING_FEATURES__ */
}
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@@ -90,10 +90,10 @@ ccl_device_noinline void kernel_path_ao(KernelGlobals *kg,
|
|||||||
light_ray.dD = differential3_zero();
|
light_ray.dD = differential3_zero();
|
||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
|
||||||
path_radiance_accum_ao(L, throughput, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
|
path_radiance_accum_ao(L, state, throughput, ao_alpha, ao_bsdf, ao_shadow);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_ao(L, throughput, ao_bsdf);
|
path_radiance_accum_total_ao(L, state, throughput, ao_bsdf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -366,6 +366,8 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
|
|||||||
throughput /= probability;
|
throughput /= probability;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel_update_denoising_features(kg, sd, state, L);
|
||||||
|
|
||||||
#ifdef __AO__
|
#ifdef __AO__
|
||||||
/* ambient occlusion */
|
/* ambient occlusion */
|
||||||
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
|
if(kernel_data.integrator.use_ambient_occlusion || (sd->flag & SD_AO)) {
|
||||||
@@ -427,18 +429,19 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
ccl_device_inline float kernel_path_integrate(KernelGlobals *kg,
|
||||||
RNG *rng,
|
RNG *rng,
|
||||||
int sample,
|
int sample,
|
||||||
Ray ray,
|
Ray ray,
|
||||||
ccl_global float *buffer)
|
ccl_global float *buffer,
|
||||||
|
PathRadiance *L,
|
||||||
|
bool *is_shadow_catcher)
|
||||||
{
|
{
|
||||||
/* initialize */
|
/* initialize */
|
||||||
PathRadiance L;
|
|
||||||
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
||||||
float L_transparent = 0.0f;
|
float L_transparent = 0.0f;
|
||||||
|
|
||||||
path_radiance_init(&L, kernel_data.film.use_light_pass);
|
path_radiance_init(L, kernel_data.film.use_light_pass);
|
||||||
|
|
||||||
/* shader data memory used for both volumes and surfaces, saves stack space */
|
/* shader data memory used for both volumes and surfaces, saves stack space */
|
||||||
ShaderData sd;
|
ShaderData sd;
|
||||||
@@ -517,7 +520,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
float3 emission;
|
float3 emission;
|
||||||
|
|
||||||
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
|
if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
|
||||||
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
|
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||||
}
|
}
|
||||||
#endif /* __LAMP_MIS__ */
|
#endif /* __LAMP_MIS__ */
|
||||||
|
|
||||||
@@ -549,7 +552,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
|
|
||||||
/* emission */
|
/* emission */
|
||||||
if(volume_segment.closure_flag & SD_EMISSION)
|
if(volume_segment.closure_flag & SD_EMISSION)
|
||||||
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
|
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
|
||||||
|
|
||||||
/* scattering */
|
/* scattering */
|
||||||
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
|
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
|
||||||
@@ -559,7 +562,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
|
|
||||||
/* direct light sampling */
|
/* direct light sampling */
|
||||||
kernel_branched_path_volume_connect_light(kg, rng, &sd,
|
kernel_branched_path_volume_connect_light(kg, rng, &sd,
|
||||||
&emission_sd, throughput, &state, &L, all,
|
&emission_sd, throughput, &state, L, all,
|
||||||
&volume_ray, &volume_segment);
|
&volume_ray, &volume_segment);
|
||||||
|
|
||||||
/* indirect sample. if we use distance sampling and take just
|
/* indirect sample. if we use distance sampling and take just
|
||||||
@@ -577,7 +580,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
kernel_volume_decoupled_free(kg, &volume_segment);
|
kernel_volume_decoupled_free(kg, &volume_segment);
|
||||||
|
|
||||||
if(result == VOLUME_PATH_SCATTERED) {
|
if(result == VOLUME_PATH_SCATTERED) {
|
||||||
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
|
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
|
||||||
continue;
|
continue;
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
@@ -591,15 +594,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
{
|
{
|
||||||
/* integrate along volume segment with distance sampling */
|
/* integrate along volume segment with distance sampling */
|
||||||
VolumeIntegrateResult result = kernel_volume_integrate(
|
VolumeIntegrateResult result = kernel_volume_integrate(
|
||||||
kg, &state, &sd, &volume_ray, &L, &throughput, rng, heterogeneous);
|
kg, &state, &sd, &volume_ray, L, &throughput, rng, heterogeneous);
|
||||||
|
|
||||||
# ifdef __VOLUME_SCATTER__
|
# ifdef __VOLUME_SCATTER__
|
||||||
if(result == VOLUME_PATH_SCATTERED) {
|
if(result == VOLUME_PATH_SCATTERED) {
|
||||||
/* direct lighting */
|
/* direct lighting */
|
||||||
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
|
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
|
||||||
|
|
||||||
/* indirect light bounce */
|
/* indirect light bounce */
|
||||||
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
|
if(kernel_path_volume_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
|
||||||
continue;
|
continue;
|
||||||
else
|
else
|
||||||
break;
|
break;
|
||||||
@@ -623,7 +626,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
#ifdef __BACKGROUND__
|
#ifdef __BACKGROUND__
|
||||||
/* sample background shader */
|
/* sample background shader */
|
||||||
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
||||||
path_radiance_accum_background(&L, &state, throughput, L_background);
|
path_radiance_accum_background(L, &state, throughput, L_background);
|
||||||
#endif /* __BACKGROUND__ */
|
#endif /* __BACKGROUND__ */
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@@ -640,10 +643,10 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||||
if(state.flag & PATH_RAY_CAMERA) {
|
if(state.flag & PATH_RAY_CAMERA) {
|
||||||
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
|
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
|
||||||
state.catcher_object = sd.object;
|
state.catcher_object = sd.object;
|
||||||
if(!kernel_data.background.transparent) {
|
if(!kernel_data.background.transparent) {
|
||||||
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -677,7 +680,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
#endif /* __HOLDOUT__ */
|
#endif /* __HOLDOUT__ */
|
||||||
|
|
||||||
/* holdout mask objects do not write data passes */
|
/* holdout mask objects do not write data passes */
|
||||||
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
|
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
|
||||||
|
|
||||||
/* blurring of bsdf after bounces, for rays that have a small likelihood
|
/* blurring of bsdf after bounces, for rays that have a small likelihood
|
||||||
* of following this particular path (diffuse, rough glossy) */
|
* of following this particular path (diffuse, rough glossy) */
|
||||||
@@ -695,7 +698,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
if(sd.flag & SD_EMISSION) {
|
if(sd.flag & SD_EMISSION) {
|
||||||
/* todo: is isect.t wrong here for transparent surfaces? */
|
/* todo: is isect.t wrong here for transparent surfaces? */
|
||||||
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
||||||
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
|
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||||
}
|
}
|
||||||
#endif /* __EMISSION__ */
|
#endif /* __EMISSION__ */
|
||||||
|
|
||||||
@@ -715,10 +718,12 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
throughput /= probability;
|
throughput /= probability;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel_update_denoising_features(kg, &sd, &state, L);
|
||||||
|
|
||||||
#ifdef __AO__
|
#ifdef __AO__
|
||||||
/* ambient occlusion */
|
/* ambient occlusion */
|
||||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||||
kernel_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
|
kernel_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput, shader_bsdf_alpha(kg, &sd));
|
||||||
}
|
}
|
||||||
#endif /* __AO__ */
|
#endif /* __AO__ */
|
||||||
|
|
||||||
@@ -729,7 +734,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
if(kernel_path_subsurface_scatter(kg,
|
if(kernel_path_subsurface_scatter(kg,
|
||||||
&sd,
|
&sd,
|
||||||
&emission_sd,
|
&emission_sd,
|
||||||
&L,
|
L,
|
||||||
&state,
|
&state,
|
||||||
rng,
|
rng,
|
||||||
&ray,
|
&ray,
|
||||||
@@ -742,15 +747,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
#endif /* __SUBSURFACE__ */
|
#endif /* __SUBSURFACE__ */
|
||||||
|
|
||||||
/* direct lighting */
|
/* direct lighting */
|
||||||
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, &L);
|
kernel_path_surface_connect_light(kg, rng, &sd, &emission_sd, throughput, &state, L);
|
||||||
|
|
||||||
/* compute direct lighting and next bounce */
|
/* compute direct lighting and next bounce */
|
||||||
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, &L, &ray))
|
if(!kernel_path_surface_bounce(kg, rng, &sd, &throughput, &state, L, &ray))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __SUBSURFACE__
|
#ifdef __SUBSURFACE__
|
||||||
kernel_path_subsurface_accum_indirect(&ss_indirect, &L);
|
kernel_path_subsurface_accum_indirect(&ss_indirect, L);
|
||||||
|
|
||||||
/* Trace indirect subsurface rays by restarting the loop. this uses less
|
/* Trace indirect subsurface rays by restarting the loop. this uses less
|
||||||
* stack memory than invoking kernel_path_indirect.
|
* stack memory than invoking kernel_path_indirect.
|
||||||
@@ -760,7 +765,7 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
&ss_indirect,
|
&ss_indirect,
|
||||||
&state,
|
&state,
|
||||||
&ray,
|
&ray,
|
||||||
&L,
|
L,
|
||||||
&throughput);
|
&throughput);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@@ -769,24 +774,15 @@ ccl_device_inline float4 kernel_path_integrate(KernelGlobals *kg,
|
|||||||
}
|
}
|
||||||
#endif /* __SUBSURFACE__ */
|
#endif /* __SUBSURFACE__ */
|
||||||
|
|
||||||
float3 L_sum;
|
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
|
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
|
||||||
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif /* __SHADOW_TRICKS__ */
|
#endif /* __SHADOW_TRICKS__ */
|
||||||
{
|
|
||||||
L_sum = path_radiance_clamp_and_sum(kg, &L);
|
|
||||||
}
|
|
||||||
|
|
||||||
kernel_write_light_passes(kg, buffer, &L, sample);
|
|
||||||
|
|
||||||
#ifdef __KERNEL_DEBUG__
|
#ifdef __KERNEL_DEBUG__
|
||||||
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
|
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
|
||||||
#endif /* __KERNEL_DEBUG__ */
|
#endif /* __KERNEL_DEBUG__ */
|
||||||
|
|
||||||
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
|
return 1.0f - L_transparent;
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device void kernel_path_trace(KernelGlobals *kg,
|
ccl_device void kernel_path_trace(KernelGlobals *kg,
|
||||||
@@ -807,15 +803,16 @@ ccl_device void kernel_path_trace(KernelGlobals *kg,
|
|||||||
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
|
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
|
||||||
|
|
||||||
/* integrate */
|
/* integrate */
|
||||||
float4 L;
|
PathRadiance L;
|
||||||
|
bool is_shadow_catcher;
|
||||||
|
|
||||||
if(ray.t != 0.0f)
|
if(ray.t != 0.0f) {
|
||||||
L = kernel_path_integrate(kg, &rng, sample, ray, buffer);
|
float alpha = kernel_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
|
||||||
else
|
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
|
||||||
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
}
|
||||||
|
else {
|
||||||
/* accumulate result in output buffer */
|
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
|
||||||
kernel_write_pass_float4(buffer, sample, L);
|
}
|
||||||
|
|
||||||
path_rng_end(kg, rng_state, rng);
|
path_rng_end(kg, rng_state, rng);
|
||||||
}
|
}
|
||||||
|
@@ -56,10 +56,10 @@ ccl_device_inline void kernel_branched_path_ao(KernelGlobals *kg,
|
|||||||
light_ray.dD = differential3_zero();
|
light_ray.dD = differential3_zero();
|
||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &ao_shadow)) {
|
||||||
path_radiance_accum_ao(L, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow, state->bounce);
|
path_radiance_accum_ao(L, state, throughput*num_samples_inv, ao_alpha, ao_bsdf, ao_shadow);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_ao(L, throughput*num_samples_inv, ao_bsdf);
|
path_radiance_accum_total_ao(L, state, throughput*num_samples_inv, ao_bsdf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -72,14 +72,32 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
|
|||||||
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
|
RNG *rng, ShaderData *sd, ShaderData *indirect_sd, ShaderData *emission_sd,
|
||||||
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
|
float3 throughput, float num_samples_adjust, PathState *state, PathRadiance *L)
|
||||||
{
|
{
|
||||||
|
float sum_sample_weight = 0.0f;
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
if(state->denoising_feature_weight > 0.0f) {
|
||||||
for(int i = 0; i < sd->num_closure; i++) {
|
for(int i = 0; i < sd->num_closure; i++) {
|
||||||
const ShaderClosure *sc = &sd->closure[i];
|
const ShaderClosure *sc = &sd->closure[i];
|
||||||
|
|
||||||
if(!CLOSURE_IS_BSDF(sc->type))
|
|
||||||
continue;
|
|
||||||
/* transparency is not handled here, but in outer loop */
|
/* transparency is not handled here, but in outer loop */
|
||||||
if(sc->type == CLOSURE_BSDF_TRANSPARENT_ID)
|
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
sum_sample_weight += sc->sample_weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
sum_sample_weight = 1.0f;
|
||||||
|
}
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
|
|
||||||
|
for(int i = 0; i < sd->num_closure; i++) {
|
||||||
|
const ShaderClosure *sc = &sd->closure[i];
|
||||||
|
|
||||||
|
/* transparency is not handled here, but in outer loop */
|
||||||
|
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
int num_samples;
|
int num_samples;
|
||||||
|
|
||||||
@@ -111,7 +129,8 @@ ccl_device_noinline void kernel_branched_path_surface_indirect_light(KernelGloba
|
|||||||
&tp,
|
&tp,
|
||||||
&ps,
|
&ps,
|
||||||
L,
|
L,
|
||||||
&bsdf_ray))
|
&bsdf_ray,
|
||||||
|
sum_sample_weight))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -243,14 +262,19 @@ ccl_device void kernel_branched_path_subsurface_scatter(KernelGlobals *kg,
|
|||||||
}
|
}
|
||||||
#endif /* __SUBSURFACE__ */
|
#endif /* __SUBSURFACE__ */
|
||||||
|
|
||||||
ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, int sample, Ray ray, ccl_global float *buffer)
|
ccl_device float kernel_branched_path_integrate(KernelGlobals *kg,
|
||||||
|
RNG *rng,
|
||||||
|
int sample,
|
||||||
|
Ray ray,
|
||||||
|
ccl_global float *buffer,
|
||||||
|
PathRadiance *L,
|
||||||
|
bool *is_shadow_catcher)
|
||||||
{
|
{
|
||||||
/* initialize */
|
/* initialize */
|
||||||
PathRadiance L;
|
|
||||||
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
|
||||||
float L_transparent = 0.0f;
|
float L_transparent = 0.0f;
|
||||||
|
|
||||||
path_radiance_init(&L, kernel_data.film.use_light_pass);
|
path_radiance_init(L, kernel_data.film.use_light_pass);
|
||||||
|
|
||||||
/* shader data memory used for both volumes and surfaces, saves stack space */
|
/* shader data memory used for both volumes and surfaces, saves stack space */
|
||||||
ShaderData sd;
|
ShaderData sd;
|
||||||
@@ -330,7 +354,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
int all = kernel_data.integrator.sample_all_lights_direct;
|
int all = kernel_data.integrator.sample_all_lights_direct;
|
||||||
|
|
||||||
kernel_branched_path_volume_connect_light(kg, rng, &sd,
|
kernel_branched_path_volume_connect_light(kg, rng, &sd,
|
||||||
&emission_sd, throughput, &state, &L, all,
|
&emission_sd, throughput, &state, L, all,
|
||||||
&volume_ray, &volume_segment);
|
&volume_ray, &volume_segment);
|
||||||
|
|
||||||
/* indirect light sampling */
|
/* indirect light sampling */
|
||||||
@@ -362,7 +386,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
&sd,
|
&sd,
|
||||||
&tp,
|
&tp,
|
||||||
&ps,
|
&ps,
|
||||||
&L,
|
L,
|
||||||
&pray))
|
&pray))
|
||||||
{
|
{
|
||||||
kernel_path_indirect(kg,
|
kernel_path_indirect(kg,
|
||||||
@@ -373,19 +397,19 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
tp*num_samples_inv,
|
tp*num_samples_inv,
|
||||||
num_samples,
|
num_samples,
|
||||||
&ps,
|
&ps,
|
||||||
&L);
|
L);
|
||||||
|
|
||||||
/* for render passes, sum and reset indirect light pass variables
|
/* for render passes, sum and reset indirect light pass variables
|
||||||
* for the next samples */
|
* for the next samples */
|
||||||
path_radiance_sum_indirect(&L);
|
path_radiance_sum_indirect(L);
|
||||||
path_radiance_reset_indirect(&L);
|
path_radiance_reset_indirect(L);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* emission and transmittance */
|
/* emission and transmittance */
|
||||||
if(volume_segment.closure_flag & SD_EMISSION)
|
if(volume_segment.closure_flag & SD_EMISSION)
|
||||||
path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
|
path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
|
||||||
throughput *= volume_segment.accum_transmittance;
|
throughput *= volume_segment.accum_transmittance;
|
||||||
|
|
||||||
/* free cached steps */
|
/* free cached steps */
|
||||||
@@ -407,20 +431,20 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
path_state_branch(&ps, j, num_samples);
|
path_state_branch(&ps, j, num_samples);
|
||||||
|
|
||||||
VolumeIntegrateResult result = kernel_volume_integrate(
|
VolumeIntegrateResult result = kernel_volume_integrate(
|
||||||
kg, &ps, &sd, &volume_ray, &L, &tp, rng, heterogeneous);
|
kg, &ps, &sd, &volume_ray, L, &tp, rng, heterogeneous);
|
||||||
|
|
||||||
#ifdef __VOLUME_SCATTER__
|
#ifdef __VOLUME_SCATTER__
|
||||||
if(result == VOLUME_PATH_SCATTERED) {
|
if(result == VOLUME_PATH_SCATTERED) {
|
||||||
/* todo: support equiangular, MIS and all light sampling.
|
/* todo: support equiangular, MIS and all light sampling.
|
||||||
* alternatively get decoupled ray marching working on the GPU */
|
* alternatively get decoupled ray marching working on the GPU */
|
||||||
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, &L);
|
kernel_path_volume_connect_light(kg, rng, &sd, &emission_sd, tp, &state, L);
|
||||||
|
|
||||||
if(kernel_path_volume_bounce(kg,
|
if(kernel_path_volume_bounce(kg,
|
||||||
rng,
|
rng,
|
||||||
&sd,
|
&sd,
|
||||||
&tp,
|
&tp,
|
||||||
&ps,
|
&ps,
|
||||||
&L,
|
L,
|
||||||
&pray))
|
&pray))
|
||||||
{
|
{
|
||||||
kernel_path_indirect(kg,
|
kernel_path_indirect(kg,
|
||||||
@@ -431,12 +455,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
tp,
|
tp,
|
||||||
num_samples,
|
num_samples,
|
||||||
&ps,
|
&ps,
|
||||||
&L);
|
L);
|
||||||
|
|
||||||
/* for render passes, sum and reset indirect light pass variables
|
/* for render passes, sum and reset indirect light pass variables
|
||||||
* for the next samples */
|
* for the next samples */
|
||||||
path_radiance_sum_indirect(&L);
|
path_radiance_sum_indirect(L);
|
||||||
path_radiance_reset_indirect(&L);
|
path_radiance_reset_indirect(L);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* __VOLUME_SCATTER__ */
|
#endif /* __VOLUME_SCATTER__ */
|
||||||
@@ -462,7 +486,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
#ifdef __BACKGROUND__
|
#ifdef __BACKGROUND__
|
||||||
/* sample background shader */
|
/* sample background shader */
|
||||||
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
|
||||||
path_radiance_accum_background(&L, &state, throughput, L_background);
|
path_radiance_accum_background(L, &state, throughput, L_background);
|
||||||
#endif /* __BACKGROUND__ */
|
#endif /* __BACKGROUND__ */
|
||||||
|
|
||||||
break;
|
break;
|
||||||
@@ -476,10 +500,10 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||||
if(state.flag & PATH_RAY_CAMERA) {
|
if(state.flag & PATH_RAY_CAMERA) {
|
||||||
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
|
state.flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
|
||||||
state.catcher_object = sd.object;
|
state.catcher_object = sd.object;
|
||||||
if(!kernel_data.background.transparent) {
|
if(!kernel_data.background.transparent) {
|
||||||
L.shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
L->shadow_color = indirect_background(kg, &emission_sd, &state, &ray);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -509,13 +533,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
#endif /* __HOLDOUT__ */
|
#endif /* __HOLDOUT__ */
|
||||||
|
|
||||||
/* holdout mask objects do not write data passes */
|
/* holdout mask objects do not write data passes */
|
||||||
kernel_write_data_passes(kg, buffer, &L, &sd, sample, &state, throughput);
|
kernel_write_data_passes(kg, buffer, L, &sd, sample, &state, throughput);
|
||||||
|
|
||||||
#ifdef __EMISSION__
|
#ifdef __EMISSION__
|
||||||
/* emission */
|
/* emission */
|
||||||
if(sd.flag & SD_EMISSION) {
|
if(sd.flag & SD_EMISSION) {
|
||||||
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
float3 emission = indirect_primitive_emission(kg, &sd, isect.t, state.flag, state.ray_pdf);
|
||||||
path_radiance_accum_emission(&L, throughput, emission, state.bounce);
|
path_radiance_accum_emission(L, throughput, emission, state.bounce);
|
||||||
}
|
}
|
||||||
#endif /* __EMISSION__ */
|
#endif /* __EMISSION__ */
|
||||||
|
|
||||||
@@ -539,10 +563,12 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel_update_denoising_features(kg, &sd, &state, L);
|
||||||
|
|
||||||
#ifdef __AO__
|
#ifdef __AO__
|
||||||
/* ambient occlusion */
|
/* ambient occlusion */
|
||||||
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
if(kernel_data.integrator.use_ambient_occlusion || (sd.flag & SD_AO)) {
|
||||||
kernel_branched_path_ao(kg, &sd, &emission_sd, &L, &state, rng, throughput);
|
kernel_branched_path_ao(kg, &sd, &emission_sd, L, &state, rng, throughput);
|
||||||
}
|
}
|
||||||
#endif /* __AO__ */
|
#endif /* __AO__ */
|
||||||
|
|
||||||
@@ -550,7 +576,7 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
/* bssrdf scatter to a different location on the same object */
|
/* bssrdf scatter to a different location on the same object */
|
||||||
if(sd.flag & SD_BSSRDF) {
|
if(sd.flag & SD_BSSRDF) {
|
||||||
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
|
kernel_branched_path_subsurface_scatter(kg, &sd, &indirect_sd, &emission_sd,
|
||||||
&L, &state, rng, &ray, throughput);
|
L, &state, rng, &ray, throughput);
|
||||||
}
|
}
|
||||||
#endif /* __SUBSURFACE__ */
|
#endif /* __SUBSURFACE__ */
|
||||||
|
|
||||||
@@ -563,13 +589,13 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
int all = (kernel_data.integrator.sample_all_lights_direct) ||
|
int all = (kernel_data.integrator.sample_all_lights_direct) ||
|
||||||
(state.flag & PATH_RAY_SHADOW_CATCHER);
|
(state.flag & PATH_RAY_SHADOW_CATCHER);
|
||||||
kernel_branched_path_surface_connect_light(kg, rng,
|
kernel_branched_path_surface_connect_light(kg, rng,
|
||||||
&sd, &emission_sd, &hit_state, throughput, 1.0f, &L, all);
|
&sd, &emission_sd, &hit_state, throughput, 1.0f, L, all);
|
||||||
}
|
}
|
||||||
#endif /* __EMISSION__ */
|
#endif /* __EMISSION__ */
|
||||||
|
|
||||||
/* indirect light */
|
/* indirect light */
|
||||||
kernel_branched_path_surface_indirect_light(kg, rng,
|
kernel_branched_path_surface_indirect_light(kg, rng,
|
||||||
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, &L);
|
&sd, &indirect_sd, &emission_sd, throughput, 1.0f, &hit_state, L);
|
||||||
|
|
||||||
/* continue in case of transparency */
|
/* continue in case of transparency */
|
||||||
throughput *= shader_bsdf_transparency(kg, &sd);
|
throughput *= shader_bsdf_transparency(kg, &sd);
|
||||||
@@ -598,24 +624,15 @@ ccl_device float4 kernel_branched_path_integrate(KernelGlobals *kg, RNG *rng, in
|
|||||||
#endif /* __VOLUME__ */
|
#endif /* __VOLUME__ */
|
||||||
}
|
}
|
||||||
|
|
||||||
float3 L_sum;
|
|
||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
if(state.flag & PATH_RAY_SHADOW_CATCHER) {
|
*is_shadow_catcher = (state.flag & PATH_RAY_SHADOW_CATCHER);
|
||||||
L_sum = path_radiance_sum_shadowcatcher(kg, &L, &L_transparent);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif /* __SHADOW_TRICKS__ */
|
#endif /* __SHADOW_TRICKS__ */
|
||||||
{
|
|
||||||
L_sum = path_radiance_clamp_and_sum(kg, &L);
|
|
||||||
}
|
|
||||||
|
|
||||||
kernel_write_light_passes(kg, buffer, &L, sample);
|
|
||||||
|
|
||||||
#ifdef __KERNEL_DEBUG__
|
#ifdef __KERNEL_DEBUG__
|
||||||
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
|
kernel_write_debug_passes(kg, buffer, &state, &debug_data, sample);
|
||||||
#endif /* __KERNEL_DEBUG__ */
|
#endif /* __KERNEL_DEBUG__ */
|
||||||
|
|
||||||
return make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - L_transparent);
|
return 1.0f - L_transparent;
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
||||||
@@ -636,15 +653,16 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
|||||||
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
|
kernel_path_trace_setup(kg, rng_state, sample, x, y, &rng, &ray);
|
||||||
|
|
||||||
/* integrate */
|
/* integrate */
|
||||||
float4 L;
|
PathRadiance L;
|
||||||
|
bool is_shadow_catcher;
|
||||||
|
|
||||||
if(ray.t != 0.0f)
|
if(ray.t != 0.0f) {
|
||||||
L = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer);
|
float alpha = kernel_branched_path_integrate(kg, &rng, sample, ray, buffer, &L, &is_shadow_catcher);
|
||||||
else
|
kernel_write_result(kg, buffer, sample, &L, alpha, is_shadow_catcher);
|
||||||
L = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
|
}
|
||||||
|
else {
|
||||||
/* accumulate result in output buffer */
|
kernel_write_result(kg, buffer, sample, NULL, 0.0f, false);
|
||||||
kernel_write_pass_float4(buffer, sample, L);
|
}
|
||||||
|
|
||||||
path_rng_end(kg, rng_state, rng);
|
path_rng_end(kg, rng_state, rng);
|
||||||
}
|
}
|
||||||
@@ -654,4 +672,3 @@ ccl_device void kernel_branched_path_trace(KernelGlobals *kg,
|
|||||||
#endif /* __BRANCHED_PATH__ */
|
#endif /* __BRANCHED_PATH__ */
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
|
||||||
|
@@ -35,6 +35,16 @@ ccl_device_inline void path_state_init(KernelGlobals *kg,
|
|||||||
state->transmission_bounce = 0;
|
state->transmission_bounce = 0;
|
||||||
state->transparent_bounce = 0;
|
state->transparent_bounce = 0;
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
if(kernel_data.film.pass_denoising_data) {
|
||||||
|
state->flag |= PATH_RAY_STORE_SHADOW_INFO;
|
||||||
|
state->denoising_feature_weight = 1.0f;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
state->denoising_feature_weight = 0.0f;
|
||||||
|
}
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
|
|
||||||
state->min_ray_pdf = FLT_MAX;
|
state->min_ray_pdf = FLT_MAX;
|
||||||
state->ray_pdf = 0.0f;
|
state->ray_pdf = 0.0f;
|
||||||
#ifdef __LAMP_MIS__
|
#ifdef __LAMP_MIS__
|
||||||
@@ -128,6 +138,10 @@ ccl_device_inline void path_state_next(KernelGlobals *kg, ccl_addr_space PathSta
|
|||||||
|
|
||||||
/* random number generator next bounce */
|
/* random number generator next bounce */
|
||||||
state->rng_offset += PRNG_BOUNCE_NUM;
|
state->rng_offset += PRNG_BOUNCE_NUM;
|
||||||
|
|
||||||
|
if((state->denoising_feature_weight == 0.0f) && !(state->flag & PATH_RAY_SHADOW_CATCHER)) {
|
||||||
|
state->flag &= ~PATH_RAY_STORE_SHADOW_INFO;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
|
ccl_device_inline uint path_state_ray_visibility(KernelGlobals *kg, PathState *state)
|
||||||
|
@@ -70,10 +70,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
|
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -107,10 +107,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, throughput*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_light(L, throughput*num_samples_inv, &L_light);
|
path_radiance_accum_total_light(L, state, throughput*num_samples_inv, &L_light);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -133,10 +133,10 @@ ccl_device_noinline void kernel_branched_path_surface_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, throughput*num_samples_adjust, &L_light, shadow, num_samples_adjust, is_lamp);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_light(L, throughput*num_samples_adjust, &L_light);
|
path_radiance_accum_total_light(L, state, throughput*num_samples_adjust, &L_light);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -155,7 +155,8 @@ ccl_device bool kernel_branched_path_surface_bounce(
|
|||||||
ccl_addr_space float3 *throughput,
|
ccl_addr_space float3 *throughput,
|
||||||
ccl_addr_space PathState *state,
|
ccl_addr_space PathState *state,
|
||||||
PathRadiance *L,
|
PathRadiance *L,
|
||||||
ccl_addr_space Ray *ray)
|
ccl_addr_space Ray *ray,
|
||||||
|
float sum_sample_weight)
|
||||||
{
|
{
|
||||||
/* sample BSDF */
|
/* sample BSDF */
|
||||||
float bsdf_pdf;
|
float bsdf_pdf;
|
||||||
@@ -175,6 +176,10 @@ ccl_device bool kernel_branched_path_surface_bounce(
|
|||||||
/* modify throughput */
|
/* modify throughput */
|
||||||
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
|
path_radiance_bsdf_bounce(L, throughput, &bsdf_eval, bsdf_pdf, state->bounce, label);
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
state->denoising_feature_weight *= sc->sample_weight / (sum_sample_weight * num_samples);
|
||||||
|
#endif
|
||||||
|
|
||||||
/* modify path state */
|
/* modify path state */
|
||||||
path_state_next(kg, state, label);
|
path_state_next(kg, state, label);
|
||||||
|
|
||||||
@@ -257,10 +262,10 @@ ccl_device_inline void kernel_path_surface_connect_light(KernelGlobals *kg, RNG
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_light(L, throughput, &L_light);
|
path_radiance_accum_total_light(L, state, throughput, &L_light);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -55,7 +55,7 @@ ccl_device_inline void kernel_path_volume_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -184,7 +184,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -233,7 +233,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, tp*num_samples_inv, &L_light, shadow, num_samples_inv, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, tp*num_samples_inv, &L_light, shadow, num_samples_inv, is_lamp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -271,7 +271,7 @@ ccl_device void kernel_branched_path_volume_connect_light(
|
|||||||
|
|
||||||
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
if(!shadow_blocked(kg, emission_sd, state, &light_ray, &shadow)) {
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, tp, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, tp, &L_light, shadow, 1.0f, is_lamp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -57,6 +57,9 @@ ccl_device float3 spherical_to_direction(float theta, float phi)
|
|||||||
|
|
||||||
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
|
ccl_device float2 direction_to_equirectangular_range(float3 dir, float4 range)
|
||||||
{
|
{
|
||||||
|
if(is_zero(dir))
|
||||||
|
return make_float2(0.0f, 0.0f);
|
||||||
|
|
||||||
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
|
float u = (atan2f(dir.y, dir.x) - range.y) / range.x;
|
||||||
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
|
float v = (acosf(dir.z / len(dir)) - range.w) / range.z;
|
||||||
|
|
||||||
|
@@ -99,7 +99,7 @@ ccl_device_noinline void shader_setup_from_ray(KernelGlobals *kg,
|
|||||||
|
|
||||||
/* smooth normal */
|
/* smooth normal */
|
||||||
if(sd->shader & SHADER_SMOOTH_NORMAL)
|
if(sd->shader & SHADER_SMOOTH_NORMAL)
|
||||||
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
|
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
|
||||||
|
|
||||||
#ifdef __DPDU__
|
#ifdef __DPDU__
|
||||||
/* dPdu/dPdv */
|
/* dPdu/dPdv */
|
||||||
@@ -186,7 +186,7 @@ void shader_setup_from_subsurface(
|
|||||||
sd->N = Ng;
|
sd->N = Ng;
|
||||||
|
|
||||||
if(sd->shader & SHADER_SMOOTH_NORMAL)
|
if(sd->shader & SHADER_SMOOTH_NORMAL)
|
||||||
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
|
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
|
||||||
|
|
||||||
# ifdef __DPDU__
|
# ifdef __DPDU__
|
||||||
/* dPdu/dPdv */
|
/* dPdu/dPdv */
|
||||||
@@ -300,7 +300,7 @@ ccl_device_inline void shader_setup_from_sample(KernelGlobals *kg,
|
|||||||
if(sd->type & PRIMITIVE_TRIANGLE) {
|
if(sd->type & PRIMITIVE_TRIANGLE) {
|
||||||
/* smooth normal */
|
/* smooth normal */
|
||||||
if(sd->shader & SHADER_SMOOTH_NORMAL) {
|
if(sd->shader & SHADER_SMOOTH_NORMAL) {
|
||||||
sd->N = triangle_smooth_normal(kg, sd->prim, sd->u, sd->v);
|
sd->N = triangle_smooth_normal(kg, Ng, sd->prim, sd->u, sd->v);
|
||||||
|
|
||||||
#ifdef __INSTANCING__
|
#ifdef __INSTANCING__
|
||||||
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
|
if(!(sd->object_flag & SD_OBJECT_TRANSFORM_APPLIED)) {
|
||||||
|
@@ -173,6 +173,8 @@ CCL_NAMESPACE_BEGIN
|
|||||||
#define __PATCH_EVAL__
|
#define __PATCH_EVAL__
|
||||||
#define __SHADOW_TRICKS__
|
#define __SHADOW_TRICKS__
|
||||||
|
|
||||||
|
#define __DENOISING_FEATURES__
|
||||||
|
|
||||||
#ifdef __KERNEL_SHADING__
|
#ifdef __KERNEL_SHADING__
|
||||||
# define __SVM__
|
# define __SVM__
|
||||||
# define __EMISSION__
|
# define __EMISSION__
|
||||||
@@ -314,31 +316,32 @@ enum SamplingPattern {
|
|||||||
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
|
/* these flags values correspond to raytypes in osl.cpp, so keep them in sync! */
|
||||||
|
|
||||||
enum PathRayFlag {
|
enum PathRayFlag {
|
||||||
PATH_RAY_CAMERA = 1,
|
PATH_RAY_CAMERA = (1 << 0),
|
||||||
PATH_RAY_REFLECT = 2,
|
PATH_RAY_REFLECT = (1 << 1),
|
||||||
PATH_RAY_TRANSMIT = 4,
|
PATH_RAY_TRANSMIT = (1 << 2),
|
||||||
PATH_RAY_DIFFUSE = 8,
|
PATH_RAY_DIFFUSE = (1 << 3),
|
||||||
PATH_RAY_GLOSSY = 16,
|
PATH_RAY_GLOSSY = (1 << 4),
|
||||||
PATH_RAY_SINGULAR = 32,
|
PATH_RAY_SINGULAR = (1 << 5),
|
||||||
PATH_RAY_TRANSPARENT = 64,
|
PATH_RAY_TRANSPARENT = (1 << 6),
|
||||||
|
|
||||||
PATH_RAY_SHADOW_OPAQUE = 128,
|
PATH_RAY_SHADOW_OPAQUE = (1 << 7),
|
||||||
PATH_RAY_SHADOW_TRANSPARENT = 256,
|
PATH_RAY_SHADOW_TRANSPARENT = (1 << 8),
|
||||||
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
|
PATH_RAY_SHADOW = (PATH_RAY_SHADOW_OPAQUE|PATH_RAY_SHADOW_TRANSPARENT),
|
||||||
|
|
||||||
PATH_RAY_CURVE = 512, /* visibility flag to define curve segments */
|
PATH_RAY_CURVE = (1 << 9), /* visibility flag to define curve segments */
|
||||||
PATH_RAY_VOLUME_SCATTER = 1024, /* volume scattering */
|
PATH_RAY_VOLUME_SCATTER = (1 << 10), /* volume scattering */
|
||||||
|
|
||||||
/* Special flag to tag unaligned BVH nodes. */
|
/* Special flag to tag unaligned BVH nodes. */
|
||||||
PATH_RAY_NODE_UNALIGNED = 2048,
|
PATH_RAY_NODE_UNALIGNED = (1 << 11),
|
||||||
|
|
||||||
PATH_RAY_ALL_VISIBILITY = (1|2|4|8|16|32|64|128|256|512|1024|2048),
|
PATH_RAY_ALL_VISIBILITY = ((1 << 12)-1),
|
||||||
|
|
||||||
PATH_RAY_MIS_SKIP = 4096,
|
PATH_RAY_MIS_SKIP = (1 << 12),
|
||||||
PATH_RAY_DIFFUSE_ANCESTOR = 8192,
|
PATH_RAY_DIFFUSE_ANCESTOR = (1 << 13),
|
||||||
PATH_RAY_SINGLE_PASS_DONE = 16384,
|
PATH_RAY_SINGLE_PASS_DONE = (1 << 14),
|
||||||
PATH_RAY_SHADOW_CATCHER = 32768,
|
PATH_RAY_SHADOW_CATCHER = (1 << 15),
|
||||||
PATH_RAY_SHADOW_CATCHER_ONLY = 65536,
|
PATH_RAY_SHADOW_CATCHER_ONLY = (1 << 16),
|
||||||
|
PATH_RAY_STORE_SHADOW_INFO = (1 << 17),
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Closure Label */
|
/* Closure Label */
|
||||||
@@ -394,6 +397,22 @@ typedef enum PassType {
|
|||||||
|
|
||||||
#define PASS_ALL (~0)
|
#define PASS_ALL (~0)
|
||||||
|
|
||||||
|
/* Offsets of the individual denoising passes, plus two size constants.
 * Consecutive pass offsets are 3 apart, except DEPTH -> DEPTH_VAR -> SHADOW_A,
 * which are 1 apart; DENOISING_PASS_SIZE_BASE (26) equals the last offset
 * (COLOR_VAR, 23) plus 3.
 * NOTE(review): presumably these are float offsets into the per-pixel
 * denoising data and SIZE_CLEAN is the size of the optional "clean" pass -
 * confirm against the code that lays out the render buffer. */
typedef enum DenoisingPassOffsets {
|
||||||
|
DENOISING_PASS_NORMAL = 0,
|
||||||
|
DENOISING_PASS_NORMAL_VAR = 3,
|
||||||
|
DENOISING_PASS_ALBEDO = 6,
|
||||||
|
DENOISING_PASS_ALBEDO_VAR = 9,
|
||||||
|
DENOISING_PASS_DEPTH = 12,
|
||||||
|
DENOISING_PASS_DEPTH_VAR = 13,
|
||||||
|
DENOISING_PASS_SHADOW_A = 14,
|
||||||
|
DENOISING_PASS_SHADOW_B = 17,
|
||||||
|
DENOISING_PASS_COLOR = 20,
|
||||||
|
DENOISING_PASS_COLOR_VAR = 23,
|
||||||
|
|
||||||
|
DENOISING_PASS_SIZE_BASE = 26,
|
||||||
|
DENOISING_PASS_SIZE_CLEAN = 3,
|
||||||
|
} DenoisingPassOffsets;
|
||||||
|
|
||||||
typedef enum BakePassFilter {
|
typedef enum BakePassFilter {
|
||||||
BAKE_FILTER_NONE = 0,
|
BAKE_FILTER_NONE = 0,
|
||||||
BAKE_FILTER_DIRECT = (1 << 0),
|
BAKE_FILTER_DIRECT = (1 << 0),
|
||||||
@@ -427,6 +446,18 @@ typedef enum BakePassFilterCombos {
|
|||||||
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
|
BAKE_FILTER_SUBSURFACE_INDIRECT = (BAKE_FILTER_INDIRECT | BAKE_FILTER_SUBSURFACE),
|
||||||
} BakePassFilterCombos;
|
} BakePassFilterCombos;
|
||||||
|
|
||||||
|
/* Bit flags, one per light component (diffuse, glossy, transmission,
 * subsurface) and per direct/indirect contribution, occupying bits 0..7.
 * DENOISING_CLEAN_ALL_PASSES is (1 << 8)-1 = 0xFF, i.e. all eight flags set.
 * NOTE(review): presumably stored in KernelFilm::denoising_flags to select
 * which components go to the "clean" pass - confirm against the users. */
typedef enum DenoiseFlag {
|
||||||
|
DENOISING_CLEAN_DIFFUSE_DIR = (1 << 0),
|
||||||
|
DENOISING_CLEAN_DIFFUSE_IND = (1 << 1),
|
||||||
|
DENOISING_CLEAN_GLOSSY_DIR = (1 << 2),
|
||||||
|
DENOISING_CLEAN_GLOSSY_IND = (1 << 3),
|
||||||
|
DENOISING_CLEAN_TRANSMISSION_DIR = (1 << 4),
|
||||||
|
DENOISING_CLEAN_TRANSMISSION_IND = (1 << 5),
|
||||||
|
DENOISING_CLEAN_SUBSURFACE_DIR = (1 << 6),
|
||||||
|
DENOISING_CLEAN_SUBSURFACE_IND = (1 << 7),
|
||||||
|
DENOISING_CLEAN_ALL_PASSES = (1 << 8)-1,
|
||||||
|
} DenoiseFlag;
|
||||||
|
|
||||||
typedef ccl_addr_space struct PathRadiance {
|
typedef ccl_addr_space struct PathRadiance {
|
||||||
#ifdef __PASSES__
|
#ifdef __PASSES__
|
||||||
int use_light_pass;
|
int use_light_pass;
|
||||||
@@ -482,6 +513,12 @@ typedef ccl_addr_space struct PathRadiance {
|
|||||||
/* Color of the background on which shadow is alpha-overed. */
|
/* Color of the background on which shadow is alpha-overed. */
|
||||||
float3 shadow_color;
|
float3 shadow_color;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
float3 denoising_normal;
|
||||||
|
float3 denoising_albedo;
|
||||||
|
float denoising_depth;
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
} PathRadiance;
|
} PathRadiance;
|
||||||
|
|
||||||
typedef struct BsdfEval {
|
typedef struct BsdfEval {
|
||||||
@@ -724,12 +761,13 @@ typedef struct AttributeDescriptor {
|
|||||||
#define SHADER_CLOSURE_BASE \
|
#define SHADER_CLOSURE_BASE \
|
||||||
float3 weight; \
|
float3 weight; \
|
||||||
ClosureType type; \
|
ClosureType type; \
|
||||||
float sample_weight \
|
float sample_weight; \
|
||||||
|
float3 N
|
||||||
|
|
||||||
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
|
typedef ccl_addr_space struct ccl_align(16) ShaderClosure {
|
||||||
SHADER_CLOSURE_BASE;
|
SHADER_CLOSURE_BASE;
|
||||||
|
|
||||||
float data[14]; /* pad to 80 bytes */
|
float data[10]; /* pad to 80 bytes */
|
||||||
} ShaderClosure;
|
} ShaderClosure;
|
||||||
|
|
||||||
/* Shader Context
|
/* Shader Context
|
||||||
@@ -960,6 +998,10 @@ typedef struct PathState {
|
|||||||
int transmission_bounce;
|
int transmission_bounce;
|
||||||
int transparent_bounce;
|
int transparent_bounce;
|
||||||
|
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
float denoising_feature_weight;
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
|
|
||||||
/* multiple importance sampling */
|
/* multiple importance sampling */
|
||||||
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
|
float min_ray_pdf; /* smallest bounce pdf over entire path up to now */
|
||||||
float ray_pdf; /* last bounce pdf */
|
float ray_pdf; /* last bounce pdf */
|
||||||
@@ -1137,6 +1179,11 @@ typedef struct KernelFilm {
|
|||||||
float mist_inv_depth;
|
float mist_inv_depth;
|
||||||
float mist_falloff;
|
float mist_falloff;
|
||||||
|
|
||||||
|
int pass_denoising_data;
|
||||||
|
int pass_denoising_clean;
|
||||||
|
int denoising_flags;
|
||||||
|
int pad;
|
||||||
|
|
||||||
#ifdef __KERNEL_DEBUG__
|
#ifdef __KERNEL_DEBUG__
|
||||||
int pass_bvh_traversed_nodes;
|
int pass_bvh_traversed_nodes;
|
||||||
int pass_bvh_traversed_instances;
|
int pass_bvh_traversed_instances;
|
||||||
|
61
intern/cycles/kernel/kernels/cpu/filter.cpp
Normal file
61
intern/cycles/kernel/kernels/cpu/filter.cpp
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* CPU kernel entry points */
|
||||||
|
|
||||||
|
/* On x86-64, we can assume SSE2, so avoid the extra kernel and compile this
|
||||||
|
* one with SSE2 intrinsics.
|
||||||
|
*/
|
||||||
|
#if defined(__x86_64__) || defined(_M_X64)
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* When building kernel for native machine detect kernel features from the flags
|
||||||
|
* set by compiler.
|
||||||
|
*/
|
||||||
|
#ifdef WITH_KERNEL_NATIVE
|
||||||
|
# ifdef __SSE2__
|
||||||
|
# ifndef __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# endif
|
||||||
|
# endif
|
||||||
|
# ifdef __SSE3__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# endif
|
||||||
|
# ifdef __SSSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# endif
|
||||||
|
# ifdef __SSE4_1__
|
||||||
|
# define __KERNEL_SSE41__
|
||||||
|
# endif
|
||||||
|
# ifdef __AVX__
|
||||||
|
# define __KERNEL_SSE__
|
||||||
|
# define __KERNEL_AVX__
|
||||||
|
# endif
|
||||||
|
# ifdef __AVX2__
|
||||||
|
# define __KERNEL_SSE__
|
||||||
|
# define __KERNEL_AVX2__
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* quiet unused define warnings */
|
||||||
|
#if defined(__KERNEL_SSE2__)
|
||||||
|
/* do nothing */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
#define KERNEL_ARCH cpu
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
39
intern/cycles/kernel/kernels/cpu/filter_avx.cpp
Normal file
39
intern/cycles/kernel/kernels/cpu/filter_avx.cpp
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Optimized CPU kernel entry points. This file is compiled with AVX
|
||||||
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
|
* is compiled without them for other CPUs. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE__
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# define __KERNEL_SSE41__
|
||||||
|
# define __KERNEL_AVX__
|
||||||
|
# endif
|
||||||
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
#define KERNEL_ARCH cpu_avx
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
40
intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
Normal file
40
intern/cycles/kernel/kernels/cpu/filter_avx2.cpp
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Optimized CPU kernel entry points. This file is compiled with AVX2
|
||||||
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
|
* is compiled without them for other CPUs. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE__
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# define __KERNEL_SSE41__
|
||||||
|
# define __KERNEL_AVX__
|
||||||
|
# define __KERNEL_AVX2__
|
||||||
|
# endif
|
||||||
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
#define KERNEL_ARCH cpu_avx2
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
132
intern/cycles/kernel/kernels/cpu/filter_cpu.h
Normal file
132
intern/cycles/kernel/kernels/cpu/filter_cpu.h
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Templated common declaration part of all CPU kernels. */
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
|
||||||
|
TilesInfo *tiles,
|
||||||
|
int x,
|
||||||
|
int y,
|
||||||
|
float *unfilteredA,
|
||||||
|
float *unfilteredB,
|
||||||
|
float *sampleV,
|
||||||
|
float *sampleVV,
|
||||||
|
float *bufferV,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int buffer_pass_stride,
|
||||||
|
int buffer_denoising_offset,
|
||||||
|
bool use_split_variance);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
|
||||||
|
TilesInfo *tiles,
|
||||||
|
int m_offset,
|
||||||
|
int v_offset,
|
||||||
|
int x,
|
||||||
|
int y,
|
||||||
|
float *mean,
|
||||||
|
float *variance,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int buffer_pass_stride,
|
||||||
|
int buffer_denoising_offset,
|
||||||
|
bool use_split_variance);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
|
||||||
|
float *mean,
|
||||||
|
float *variance,
|
||||||
|
float *a,
|
||||||
|
float *b,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int r);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
|
||||||
|
int x,
|
||||||
|
int y,
|
||||||
|
int storage_ofs,
|
||||||
|
float *transform,
|
||||||
|
int *rank,
|
||||||
|
int* rect,
|
||||||
|
int pass_stride,
|
||||||
|
int radius,
|
||||||
|
float pca_threshold);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||||
|
int dy,
|
||||||
|
float *weightImage,
|
||||||
|
float *variance,
|
||||||
|
float *differenceImage,
|
||||||
|
int* rect,
|
||||||
|
int w,
|
||||||
|
int channel_offset,
|
||||||
|
float a,
|
||||||
|
float k_2);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
|
||||||
|
float *outImage,
|
||||||
|
int* rect,
|
||||||
|
int w,
|
||||||
|
int f);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
|
||||||
|
float *outImage,
|
||||||
|
int* rect,
|
||||||
|
int w,
|
||||||
|
int f);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||||
|
int dy,
|
||||||
|
float *differenceImage,
|
||||||
|
float *image,
|
||||||
|
float *outImage,
|
||||||
|
float *accumImage,
|
||||||
|
int* rect,
|
||||||
|
int w,
|
||||||
|
int f);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||||
|
int dy,
|
||||||
|
float *differenceImage,
|
||||||
|
float *buffer,
|
||||||
|
float *color_pass,
|
||||||
|
float *variance_pass,
|
||||||
|
float *transform,
|
||||||
|
int *rank,
|
||||||
|
float *XtWX,
|
||||||
|
float3 *XtWY,
|
||||||
|
int *rect,
|
||||||
|
int *filter_rect,
|
||||||
|
int w,
|
||||||
|
int h,
|
||||||
|
int f,
|
||||||
|
int pass_stride);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
|
||||||
|
float *accumImage,
|
||||||
|
int* rect,
|
||||||
|
int w);
|
||||||
|
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||||
|
int y,
|
||||||
|
int storage_ofs,
|
||||||
|
int w,
|
||||||
|
int h,
|
||||||
|
float *buffer,
|
||||||
|
int *rank,
|
||||||
|
float *XtWX,
|
||||||
|
float3 *XtWY,
|
||||||
|
int *buffer_params,
|
||||||
|
int sample);
|
||||||
|
|
||||||
|
#undef KERNEL_ARCH
|
259
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
Normal file
259
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Templated common implementation part of all CPU kernels.
|
||||||
|
*
|
||||||
|
* The idea is that each particular .cpp file sets the needed optimization flags and
|
||||||
|
* simply includes this file, without having to copy the actual implementation over.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "kernel/kernel_compat_cpu.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_kernel.h"
|
||||||
|
|
||||||
|
/* Only defined when this file is compiled as a stub (KERNEL_STUB).
 * STUB_ASSERT stringifies the kernel name and architecture into one message
 * and asserts on the negation of that string literal. A string literal is
 * never null, so the condition is always false and the assert fires whenever
 * it is evaluated (assuming assert is not compiled out). */
#ifdef KERNEL_STUB
|
||||||
|
# include "util/util_debug.h"
|
||||||
|
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
|
|
||||||
|
/* Denoise filter */
|
||||||
|
|
||||||
|
/* Per-architecture CPU entry point for the filter_divide_shadow kernel.
 * In a stub build (KERNEL_STUB) it only triggers STUB_ASSERT. Otherwise it
 * forwards all arguments, in the same order, to kernel_filter_divide_shadow();
 * the only conversion is prefilter_rect, which is passed through load_int4(). */
void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
|
||||||
|
TilesInfo *tiles,
|
||||||
|
int x,
|
||||||
|
int y,
|
||||||
|
float *unfilteredA,
|
||||||
|
float *unfilteredB,
|
||||||
|
float *sampleVariance,
|
||||||
|
float *sampleVarianceV,
|
||||||
|
float *bufferVariance,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int buffer_pass_stride,
|
||||||
|
int buffer_denoising_offset,
|
||||||
|
bool use_split_variance)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
|
||||||
|
#else
|
||||||
|
kernel_filter_divide_shadow(sample, tiles,
|
||||||
|
x, y,
|
||||||
|
unfilteredA,
|
||||||
|
unfilteredB,
|
||||||
|
sampleVariance,
|
||||||
|
sampleVarianceV,
|
||||||
|
bufferVariance,
|
||||||
|
load_int4(prefilter_rect),
|
||||||
|
buffer_pass_stride,
|
||||||
|
buffer_denoising_offset,
|
||||||
|
use_split_variance);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Per-architecture CPU entry point for the filter_get_feature kernel.
 * In a stub build (KERNEL_STUB) it only triggers STUB_ASSERT. Otherwise it
 * forwards all arguments, in the same order, to kernel_filter_get_feature();
 * the only conversion is prefilter_rect, which is passed through load_int4(). */
void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
|
||||||
|
TilesInfo *tiles,
|
||||||
|
int m_offset,
|
||||||
|
int v_offset,
|
||||||
|
int x,
|
||||||
|
int y,
|
||||||
|
float *mean, float *variance,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int buffer_pass_stride,
|
||||||
|
int buffer_denoising_offset,
|
||||||
|
bool use_split_variance)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
|
||||||
|
#else
|
||||||
|
kernel_filter_get_feature(sample, tiles,
|
||||||
|
m_offset, v_offset,
|
||||||
|
x, y,
|
||||||
|
mean, variance,
|
||||||
|
load_int4(prefilter_rect),
|
||||||
|
buffer_pass_stride,
|
||||||
|
buffer_denoising_offset,
|
||||||
|
use_split_variance);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Per-architecture CPU entry point for the filter_combine_halves kernel.
 * In a stub build (KERNEL_STUB) it only triggers STUB_ASSERT. Otherwise it
 * forwards all arguments, in the same order, to kernel_filter_combine_halves(),
 * converting prefilter_rect with load_int4(). */
void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
|
||||||
|
float *mean,
|
||||||
|
float *variance,
|
||||||
|
float *a,
|
||||||
|
float *b,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int r)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
|
||||||
|
#else
|
||||||
|
kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Per-architecture CPU entry point for the filter_construct_transform kernel.
 * In a stub build (KERNEL_STUB) it only triggers STUB_ASSERT. Otherwise it
 * first advances `rank` by storage_ofs elements and `transform` by
 * storage_ofs*TRANSFORM_SIZE elements, then calls
 * kernel_filter_construct_transform() with the adjusted pointers.
 * Note that the callee takes its arguments in a different order than this
 * wrapper receives them: the rect (converted with load_int4()) and
 * pass_stride come before transform and rank. */
void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
|
||||||
|
int x,
|
||||||
|
int y,
|
||||||
|
int storage_ofs,
|
||||||
|
float *transform,
|
||||||
|
int *rank,
|
||||||
|
int* prefilter_rect,
|
||||||
|
int pass_stride,
|
||||||
|
int radius,
|
||||||
|
float pca_threshold)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
|
||||||
|
#else
|
||||||
|
rank += storage_ofs;
|
||||||
|
transform += storage_ofs*TRANSFORM_SIZE;
|
||||||
|
kernel_filter_construct_transform(buffer,
|
||||||
|
x, y,
|
||||||
|
load_int4(prefilter_rect),
|
||||||
|
pass_stride,
|
||||||
|
transform,
|
||||||
|
rank,
|
||||||
|
radius,
|
||||||
|
pca_threshold);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Per-architecture CPU entry point for the filter_nlm_calc_difference kernel.
 * In a stub build (KERNEL_STUB) it only triggers STUB_ASSERT. Otherwise it
 * forwards all arguments, in the same order, to
 * kernel_filter_nlm_calc_difference(), converting rect with load_int4(). */
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||||
|
int dy,
|
||||||
|
float *weightImage,
|
||||||
|
float *variance,
|
||||||
|
float *differenceImage,
|
||||||
|
int *rect,
|
||||||
|
int w,
|
||||||
|
int channel_offset,
|
||||||
|
float a,
|
||||||
|
float k_2)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
|
||||||
|
#else
|
||||||
|
kernel_filter_nlm_calc_difference(dx, dy, weightImage, variance, differenceImage, load_int4(rect), w, channel_offset, a, k_2);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CPU entry point for the filter_nlm_blur kernel.
 *
 * When KERNEL_STUB is defined, the body only invokes STUB_ASSERT. Otherwise
 * rect is converted with load_int4() and the arguments are forwarded, in the
 * same order, to kernel_filter_nlm_blur().
 *
 * NOTE(review): w is presumably the row width of the images and f a blur
 * radius - confirm against the kernel implementation. */
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *differenceImage,
|
||||||
|
float *outImage,
|
||||||
|
int *rect,
|
||||||
|
int w,
|
||||||
|
int f)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
|
||||||
|
#else
|
||||||
|
kernel_filter_nlm_blur(differenceImage, outImage, load_int4(rect), w, f);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CPU entry point for the filter_nlm_calc_weight kernel.
 *
 * When KERNEL_STUB is defined, the body only invokes STUB_ASSERT. Otherwise
 * rect is converted with load_int4() and the arguments are forwarded, in the
 * same order, to kernel_filter_nlm_calc_weight().
 *
 * The signature is identical to the filter_nlm_blur entry point; only the
 * kernel being called differs. */
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *differenceImage,
|
||||||
|
float *outImage,
|
||||||
|
int *rect,
|
||||||
|
int w,
|
||||||
|
int f)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
|
||||||
|
#else
|
||||||
|
kernel_filter_nlm_calc_weight(differenceImage, outImage, load_int4(rect), w, f);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CPU entry point for the filter_nlm_update_output kernel.
 *
 * When KERNEL_STUB is defined, the body only invokes STUB_ASSERT. Otherwise
 * rect is converted with load_int4() and every argument is forwarded, in the
 * same order, to kernel_filter_nlm_update_output().
 *
 * NOTE(review): outImage and accumImage are presumably accumulated into by
 * the kernel (see the separate filter_nlm_normalize entry point, which takes
 * the same two buffers) - confirm against the kernel implementation. */
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||||
|
int dy,
|
||||||
|
float *differenceImage,
|
||||||
|
float *image,
|
||||||
|
float *outImage,
|
||||||
|
float *accumImage,
|
||||||
|
int *rect,
|
||||||
|
int w,
|
||||||
|
int f)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
|
||||||
|
#else
|
||||||
|
kernel_filter_nlm_update_output(dx, dy, differenceImage, image, outImage, accumImage, load_int4(rect), w, f);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CPU entry point for the filter_nlm_construct_gramian kernel.
 *
 * When KERNEL_STUB is defined, the body only invokes STUB_ASSERT. Otherwise
 * both rect and filter_rect are converted with load_int4() and every argument
 * is forwarded, in the same order, to kernel_filter_nlm_construct_gramian().
 *
 * Unlike filter_construct_transform and filter_finalize, this wrapper does
 * not offset transform, rank, XtWX or XtWY before forwarding them.
 *
 * NOTE(review): XtWX/XtWY presumably hold the accumulated normal-equation
 * terms of a weighted least-squares fit - confirm against the kernel
 * implementation. */
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||||
|
int dy,
|
||||||
|
float *differenceImage,
|
||||||
|
float *buffer,
|
||||||
|
float *color_pass,
|
||||||
|
float *variance_pass,
|
||||||
|
float *transform,
|
||||||
|
int *rank,
|
||||||
|
float *XtWX,
|
||||||
|
float3 *XtWY,
|
||||||
|
int *rect,
|
||||||
|
int *filter_rect,
|
||||||
|
int w,
|
||||||
|
int h,
|
||||||
|
int f,
|
||||||
|
int pass_stride)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
|
||||||
|
#else
|
||||||
|
kernel_filter_nlm_construct_gramian(dx, dy, differenceImage, buffer, color_pass, variance_pass, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CPU entry point for the filter_nlm_normalize kernel.
 *
 * When KERNEL_STUB is defined, the body only invokes STUB_ASSERT. Otherwise
 * rect is converted with load_int4() and the arguments are forwarded, in the
 * same order, to kernel_filter_nlm_normalize().
 *
 * NOTE(review): presumably divides outImage by the weights gathered in
 * accumImage - confirm against the kernel implementation. */
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *outImage,
|
||||||
|
float *accumImage,
|
||||||
|
int *rect,
|
||||||
|
int w)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
|
||||||
|
#else
|
||||||
|
kernel_filter_nlm_normalize(outImage, accumImage, load_int4(rect), w);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/* CPU entry point for the filter_finalize kernel.
 *
 * When KERNEL_STUB is defined, the body only invokes STUB_ASSERT. Otherwise
 * XtWX is advanced by storage_ofs*XTWX_SIZE elements, XtWY by
 * storage_ofs*XTWY_SIZE elements and rank by storage_ofs entries, so the
 * callee receives the storage slot selected by storage_ofs. buffer_params is
 * converted with load_int4() and the call is forwarded to
 * kernel_filter_finalize().
 *
 * NOTE(review): the literal 1 passed right after rank is not derived from any
 * parameter of this wrapper; it is presumably a storage stride (contiguous
 * per-pixel storage on the CPU) - confirm against the signature of
 * kernel_filter_finalize(). */
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||||
|
int y,
|
||||||
|
int storage_ofs,
|
||||||
|
int w,
|
||||||
|
int h,
|
||||||
|
float *buffer,
|
||||||
|
int *rank,
|
||||||
|
float *XtWX,
|
||||||
|
float3 *XtWY,
|
||||||
|
int *buffer_params,
|
||||||
|
int sample)
|
||||||
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, filter_finalize);
|
||||||
|
#else
|
||||||
|
XtWX += storage_ofs*XTWX_SIZE;
|
||||||
|
XtWY += storage_ofs*XTWY_SIZE;
|
||||||
|
rank += storage_ofs;
|
||||||
|
kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef KERNEL_STUB
|
||||||
|
#undef STUB_ASSERT
|
||||||
|
#undef KERNEL_ARCH
|
||||||
|
|
||||||
|
CCL_NAMESPACE_END
|
34
intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
Normal file
34
intern/cycles/kernel/kernels/cpu/filter_sse2.cpp
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Optimized CPU kernel entry points. This file is compiled with SSE2
|
||||||
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
|
* is compiled without them for other CPUs. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# endif
|
||||||
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse2
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
36
intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
Normal file
36
intern/cycles/kernel/kernels/cpu/filter_sse3.cpp
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Optimized CPU kernel entry points. This file is compiled with SSE3/SSSE3
|
||||||
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
|
* is compiled without them for other CPUs. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# endif
|
||||||
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse3
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
37
intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
Normal file
37
intern/cycles/kernel/kernels/cpu/filter_sse41.cpp
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Optimized CPU kernel entry points. This file is compiled with SSE4.1
|
||||||
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
|
* is compiled without them for other CPUs. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# define __KERNEL_SSE41__
|
||||||
|
# endif
|
||||||
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||||
|
|
||||||
|
#include "kernel/filter/filter.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse41
|
||||||
|
#include "kernel/kernels/cpu/filter_cpu_impl.h"
|
@@ -18,20 +18,22 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
# define __KERNEL_SSE__
|
# define __KERNEL_SSE__
|
||||||
# define __KERNEL_SSE2__
|
# define __KERNEL_SSE2__
|
||||||
# define __KERNEL_SSE3__
|
# define __KERNEL_SSE3__
|
||||||
# define __KERNEL_SSSE3__
|
# define __KERNEL_SSSE3__
|
||||||
# define __KERNEL_SSE41__
|
# define __KERNEL_SSE41__
|
||||||
# define __KERNEL_AVX__
|
# define __KERNEL_AVX__
|
||||||
#endif
|
# endif
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
# include "kernel/kernel.h"
|
|
||||||
# define KERNEL_ARCH cpu_avx
|
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_avx
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,8 +18,13 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
# define __KERNEL_SSE__
|
# define __KERNEL_SSE__
|
||||||
# define __KERNEL_SSE2__
|
# define __KERNEL_SSE2__
|
||||||
# define __KERNEL_SSE3__
|
# define __KERNEL_SSE3__
|
||||||
@@ -27,12 +32,9 @@
|
|||||||
# define __KERNEL_SSE41__
|
# define __KERNEL_SSE41__
|
||||||
# define __KERNEL_AVX__
|
# define __KERNEL_AVX__
|
||||||
# define __KERNEL_AVX2__
|
# define __KERNEL_AVX2__
|
||||||
#endif
|
# endif
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
# include "kernel/kernel.h"
|
|
||||||
# define KERNEL_ARCH cpu_avx2
|
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_avx2
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -89,6 +89,4 @@ DECLARE_SPLIT_KERNEL_FUNCTION(next_iteration_setup)
|
|||||||
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
|
DECLARE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
|
||||||
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
|
DECLARE_SPLIT_KERNEL_FUNCTION(buffer_update)
|
||||||
|
|
||||||
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func));
|
|
||||||
|
|
||||||
#undef KERNEL_ARCH
|
#undef KERNEL_ARCH
|
||||||
|
@@ -57,6 +57,11 @@
|
|||||||
# include "kernel/split/kernel_buffer_update.h"
|
# include "kernel/split/kernel_buffer_update.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
# include "util/util_debug.h"
|
||||||
|
# define STUB_ASSERT(arch, name) assert(!(#name " kernel stub for architecture " #arch " was called!"))
|
||||||
|
#endif
|
||||||
|
|
||||||
CCL_NAMESPACE_BEGIN
|
CCL_NAMESPACE_BEGIN
|
||||||
|
|
||||||
#ifndef __SPLIT_KERNEL__
|
#ifndef __SPLIT_KERNEL__
|
||||||
@@ -71,7 +76,10 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
|
|||||||
int offset,
|
int offset,
|
||||||
int stride)
|
int stride)
|
||||||
{
|
{
|
||||||
#ifdef __BRANCHED_PATH__
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, path_trace);
|
||||||
|
#else
|
||||||
|
# ifdef __BRANCHED_PATH__
|
||||||
if(kernel_data.integrator.branched) {
|
if(kernel_data.integrator.branched) {
|
||||||
kernel_branched_path_trace(kg,
|
kernel_branched_path_trace(kg,
|
||||||
buffer,
|
buffer,
|
||||||
@@ -82,10 +90,11 @@ void KERNEL_FUNCTION_FULL_NAME(path_trace)(KernelGlobals *kg,
|
|||||||
stride);
|
stride);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
#endif
|
# endif
|
||||||
{
|
{
|
||||||
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
|
kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
|
||||||
}
|
}
|
||||||
|
#endif /* KERNEL_STUB */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Film */
|
/* Film */
|
||||||
@@ -98,6 +107,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
|
|||||||
int offset,
|
int offset,
|
||||||
int stride)
|
int stride)
|
||||||
{
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, convert_to_byte);
|
||||||
|
#else
|
||||||
kernel_film_convert_to_byte(kg,
|
kernel_film_convert_to_byte(kg,
|
||||||
rgba,
|
rgba,
|
||||||
buffer,
|
buffer,
|
||||||
@@ -105,6 +117,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_byte)(KernelGlobals *kg,
|
|||||||
x, y,
|
x, y,
|
||||||
offset,
|
offset,
|
||||||
stride);
|
stride);
|
||||||
|
#endif /* KERNEL_STUB */
|
||||||
}
|
}
|
||||||
|
|
||||||
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
||||||
@@ -115,6 +128,9 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
|||||||
int offset,
|
int offset,
|
||||||
int stride)
|
int stride)
|
||||||
{
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, convert_to_half_float);
|
||||||
|
#else
|
||||||
kernel_film_convert_to_half_float(kg,
|
kernel_film_convert_to_half_float(kg,
|
||||||
rgba,
|
rgba,
|
||||||
buffer,
|
buffer,
|
||||||
@@ -122,6 +138,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
|
|||||||
x, y,
|
x, y,
|
||||||
offset,
|
offset,
|
||||||
stride);
|
stride);
|
||||||
|
#endif /* KERNEL_STUB */
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Shader Evaluate */
|
/* Shader Evaluate */
|
||||||
@@ -136,9 +153,12 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
|||||||
int offset,
|
int offset,
|
||||||
int sample)
|
int sample)
|
||||||
{
|
{
|
||||||
|
#ifdef KERNEL_STUB
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, shader);
|
||||||
|
#else
|
||||||
if(type >= SHADER_EVAL_BAKE) {
|
if(type >= SHADER_EVAL_BAKE) {
|
||||||
kernel_assert(output_luma == NULL);
|
kernel_assert(output_luma == NULL);
|
||||||
#ifdef __BAKING__
|
# ifdef __BAKING__
|
||||||
kernel_bake_evaluate(kg,
|
kernel_bake_evaluate(kg,
|
||||||
input,
|
input,
|
||||||
output,
|
output,
|
||||||
@@ -147,7 +167,7 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
|||||||
i,
|
i,
|
||||||
offset,
|
offset,
|
||||||
sample);
|
sample);
|
||||||
#endif
|
# endif
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
kernel_shader_evaluate(kg,
|
kernel_shader_evaluate(kg,
|
||||||
@@ -158,17 +178,26 @@ void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
|
|||||||
i,
|
i,
|
||||||
sample);
|
sample);
|
||||||
}
|
}
|
||||||
|
#endif /* KERNEL_STUB */
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* __SPLIT_KERNEL__ */
|
#else /* __SPLIT_KERNEL__ */
|
||||||
|
|
||||||
/* Split Kernel Path Tracing */
|
/* Split Kernel Path Tracing */
|
||||||
|
|
||||||
#define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
#ifdef KERNEL_STUB
|
||||||
|
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||||
|
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||||
|
{ \
|
||||||
|
STUB_ASSERT(KERNEL_ARCH, name); \
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
# define DEFINE_SPLIT_KERNEL_FUNCTION(name) \
|
||||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||||
{ \
|
{ \
|
||||||
kernel_##name(kg); \
|
kernel_##name(kg); \
|
||||||
}
|
}
|
||||||
|
#endif /* KERNEL_STUB */
|
||||||
|
|
||||||
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
|
#define DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(name, type) \
|
||||||
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
void KERNEL_FUNCTION_FULL_NAME(name)(KernelGlobals *kg, KernelData* /*data*/) \
|
||||||
@@ -194,42 +223,10 @@ DEFINE_SPLIT_KERNEL_FUNCTION(shadow_blocked_dl)
|
|||||||
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
|
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(next_iteration_setup, uint)
|
||||||
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
|
DEFINE_SPLIT_KERNEL_FUNCTION(indirect_subsurface)
|
||||||
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
|
DEFINE_SPLIT_KERNEL_FUNCTION_LOCALS(buffer_update, uint)
|
||||||
|
|
||||||
void KERNEL_FUNCTION_FULL_NAME(register_functions)(void(*reg)(const char* name, void* func))
|
|
||||||
{
|
|
||||||
#define REGISTER_NAME_STRING(name) #name
|
|
||||||
#define REGISTER_EVAL_NAME(name) REGISTER_NAME_STRING(name)
|
|
||||||
#define REGISTER(name) reg(REGISTER_EVAL_NAME(KERNEL_FUNCTION_FULL_NAME(name)), (void*)KERNEL_FUNCTION_FULL_NAME(name));
|
|
||||||
|
|
||||||
REGISTER(path_trace);
|
|
||||||
REGISTER(convert_to_byte);
|
|
||||||
REGISTER(convert_to_half_float);
|
|
||||||
REGISTER(shader);
|
|
||||||
|
|
||||||
REGISTER(data_init);
|
|
||||||
REGISTER(path_init);
|
|
||||||
REGISTER(scene_intersect);
|
|
||||||
REGISTER(lamp_emission);
|
|
||||||
REGISTER(do_volume);
|
|
||||||
REGISTER(queue_enqueue);
|
|
||||||
REGISTER(indirect_background);
|
|
||||||
REGISTER(shader_setup);
|
|
||||||
REGISTER(shader_sort);
|
|
||||||
REGISTER(shader_eval);
|
|
||||||
REGISTER(holdout_emission_blurring_pathtermination_ao);
|
|
||||||
REGISTER(subsurface_scatter);
|
|
||||||
REGISTER(direct_lighting);
|
|
||||||
REGISTER(shadow_blocked_ao);
|
|
||||||
REGISTER(shadow_blocked_dl);
|
|
||||||
REGISTER(next_iteration_setup);
|
|
||||||
REGISTER(indirect_subsurface);
|
|
||||||
REGISTER(buffer_update);
|
|
||||||
|
|
||||||
#undef REGISTER
|
|
||||||
#undef REGISTER_EVAL_NAME
|
|
||||||
#undef REGISTER_NAME_STRING
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif /* __SPLIT_KERNEL__ */
|
#endif /* __SPLIT_KERNEL__ */
|
||||||
|
|
||||||
|
#undef KERNEL_STUB
|
||||||
|
#undef STUB_ASSERT
|
||||||
|
#undef KERNEL_ARCH
|
||||||
|
|
||||||
CCL_NAMESPACE_END
|
CCL_NAMESPACE_END
|
||||||
|
@@ -18,21 +18,24 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
|
#define __SPLIT_KERNEL__
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE__
|
||||||
# define __KERNEL_SSE2__
|
# define __KERNEL_SSE2__
|
||||||
# define __KERNEL_SSE3__
|
# define __KERNEL_SSE3__
|
||||||
# define __KERNEL_SSSE3__
|
# define __KERNEL_SSSE3__
|
||||||
# define __KERNEL_SSE41__
|
# define __KERNEL_SSE41__
|
||||||
# define __KERNEL_AVX__
|
# define __KERNEL_AVX__
|
||||||
#endif
|
# endif
|
||||||
|
|
||||||
#define __SPLIT_KERNEL__
|
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
|
|
||||||
# include "kernel/kernel.h"
|
|
||||||
# define KERNEL_ARCH cpu_avx
|
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_avx
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,8 +18,15 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
|
#define __SPLIT_KERNEL__
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
# define __KERNEL_SSE__
|
# define __KERNEL_SSE__
|
||||||
# define __KERNEL_SSE2__
|
# define __KERNEL_SSE2__
|
||||||
# define __KERNEL_SSE3__
|
# define __KERNEL_SSE3__
|
||||||
@@ -27,14 +34,9 @@
|
|||||||
# define __KERNEL_SSE41__
|
# define __KERNEL_SSE41__
|
||||||
# define __KERNEL_AVX__
|
# define __KERNEL_AVX__
|
||||||
# define __KERNEL_AVX2__
|
# define __KERNEL_AVX2__
|
||||||
#endif
|
# endif
|
||||||
|
|
||||||
#define __SPLIT_KERNEL__
|
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
|
|
||||||
# include "kernel/kernel.h"
|
|
||||||
# define KERNEL_ARCH cpu_avx2
|
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_AVX2 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_avx2
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,17 +18,19 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
|
||||||
# define __KERNEL_SSE2__
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define __SPLIT_KERNEL__
|
#define __SPLIT_KERNEL__
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||||
# include "kernel/kernel.h"
|
# define KERNEL_STUB
|
||||||
# define KERNEL_ARCH cpu_sse2
|
#else
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# endif
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse2
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,19 +18,21 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
|
||||||
# define __KERNEL_SSE2__
|
|
||||||
# define __KERNEL_SSE3__
|
|
||||||
# define __KERNEL_SSSE3__
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define __SPLIT_KERNEL__
|
#define __SPLIT_KERNEL__
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||||
# include "kernel/kernel.h"
|
# define KERNEL_STUB
|
||||||
# define KERNEL_ARCH cpu_sse3
|
#else
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# endif
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse3
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,20 +18,22 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
|
||||||
# define __KERNEL_SSE2__
|
|
||||||
# define __KERNEL_SSE3__
|
|
||||||
# define __KERNEL_SSSE3__
|
|
||||||
# define __KERNEL_SSE41__
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define __SPLIT_KERNEL__
|
#define __SPLIT_KERNEL__
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||||
# include "kernel/kernel.h"
|
# define KERNEL_STUB
|
||||||
# define KERNEL_ARCH cpu_sse41
|
#else
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# define __KERNEL_SSE3__
|
||||||
|
# define __KERNEL_SSSE3__
|
||||||
|
# define __KERNEL_SSE41__
|
||||||
|
# endif
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse41
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,15 +18,17 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
|
||||||
# define __KERNEL_SSE2__
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
|
||||||
# include "kernel/kernel.h"
|
# define KERNEL_STUB
|
||||||
# define KERNEL_ARCH cpu_sse2
|
#else
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
|
# define __KERNEL_SSE2__
|
||||||
|
# endif
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE2 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse2
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,17 +18,19 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
# define __KERNEL_SSE2__
|
# define __KERNEL_SSE2__
|
||||||
# define __KERNEL_SSE3__
|
# define __KERNEL_SSE3__
|
||||||
# define __KERNEL_SSSE3__
|
# define __KERNEL_SSSE3__
|
||||||
#endif
|
# endif
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
|
|
||||||
# include "kernel/kernel.h"
|
|
||||||
# define KERNEL_ARCH cpu_sse3
|
|
||||||
# include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE3 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse3
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
@@ -18,18 +18,20 @@
|
|||||||
* optimization flags and nearly all functions inlined, while kernel.cpp
|
* optimization flags and nearly all functions inlined, while kernel.cpp
|
||||||
* is compiled without for other CPU's. */
|
* is compiled without for other CPU's. */
|
||||||
|
|
||||||
|
#include "util/util_optimization.h"
|
||||||
|
|
||||||
|
#ifndef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
||||||
|
# define KERNEL_STUB
|
||||||
|
#else
|
||||||
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
/* SSE optimization disabled for now on 32 bit, see bug #36316 */
|
||||||
#if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
# if !(defined(__GNUC__) && (defined(i386) || defined(_M_IX86)))
|
||||||
# define __KERNEL_SSE2__
|
# define __KERNEL_SSE2__
|
||||||
# define __KERNEL_SSE3__
|
# define __KERNEL_SSE3__
|
||||||
# define __KERNEL_SSSE3__
|
# define __KERNEL_SSSE3__
|
||||||
# define __KERNEL_SSE41__
|
# define __KERNEL_SSE41__
|
||||||
#endif
|
# endif
|
||||||
|
|
||||||
#include "util/util_optimization.h"
|
|
||||||
|
|
||||||
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
|
|
||||||
# include "kernel/kernel.h"
|
|
||||||
# define KERNEL_ARCH cpu_sse41
|
|
||||||
# include "kernel/kernels/cpu//kernel_cpu_impl.h"
|
|
||||||
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
#endif /* WITH_CYCLES_OPTIMIZED_KERNEL_SSE41 */
|
||||||
|
|
||||||
|
#include "kernel/kernel.h"
|
||||||
|
#define KERNEL_ARCH cpu_sse41
|
||||||
|
#include "kernel/kernels/cpu/kernel_cpu_impl.h"
|
||||||
|
235
intern/cycles/kernel/kernels/cuda/filter.cu
Normal file
235
intern/cycles/kernel/kernels/cuda/filter.cu
Normal file
@@ -0,0 +1,235 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* CUDA kernel entry points */
|
||||||
|
|
||||||
|
#ifdef __CUDA_ARCH__
|
||||||
|
|
||||||
|
#include "kernel_config.h"
|
||||||
|
|
||||||
|
#include "kernel/kernel_compat_cuda.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_kernel.h"
|
||||||
|
|
||||||
|
/* kernels */
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_divide_shadow().
 *
 * Every thread derives one (x, y) position from its block and thread
 * indices, offset by the origin of prefilter_rect (.x, .y). Positions at or
 * beyond the exclusive upper bounds (.z, .w) are skipped; for all others the
 * arguments are forwarded unchanged to the device function. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
		TilesInfo *tiles,
		float *unfilteredA,
		float *unfilteredB,
		float *sampleVariance,
		float *sampleVarianceV,
		float *bufferVariance,
		int4 prefilter_rect,
		int buffer_pass_stride,
		int buffer_denoising_offset,
		bool use_split_variance)
{
	const int pixel_x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	const int pixel_y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;

	/* Threads launched past the rectangle have nothing to process. */
	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
		return;
	}

	kernel_filter_divide_shadow(sample, tiles,
	                            pixel_x, pixel_y,
	                            unfilteredA, unfilteredB,
	                            sampleVariance, sampleVarianceV,
	                            bufferVariance,
	                            prefilter_rect,
	                            buffer_pass_stride,
	                            buffer_denoising_offset,
	                            use_split_variance);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_get_feature().
 *
 * Maps the calling thread to a position inside prefilter_rect (origin .x/.y,
 * exclusive end .z/.w) and, if the position is inside, passes all arguments
 * through to the device function together with that position. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
		TilesInfo *tiles,
		int m_offset,
		int v_offset,
		float *mean,
		float *variance,
		int4 prefilter_rect,
		int buffer_pass_stride,
		int buffer_denoising_offset,
		bool use_split_variance)
{
	const int pixel_x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	const int pixel_y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;

	/* Ignore threads that fall outside of the rectangle. */
	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
		return;
	}

	kernel_filter_get_feature(sample, tiles,
	                          m_offset, v_offset,
	                          pixel_x, pixel_y,
	                          mean, variance,
	                          prefilter_rect,
	                          buffer_pass_stride,
	                          buffer_denoising_offset,
	                          use_split_variance);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_combine_halves().
 *
 * One thread handles one position of prefilter_rect; threads beyond the
 * exclusive end (.z, .w) of the rectangle return without doing any work. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
	const int pixel_x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
	const int pixel_y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;

	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
		return;
	}

	kernel_filter_combine_halves(pixel_x, pixel_y, mean, variance, a, b, prefilter_rect, r);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_construct_transform().
 *
 * The thread position is relative to filter_area: it is compared against
 * filter_area.z/.w and shifted by filter_area.x/.y before being handed to
 * the device function. The transform and rank pointers are advanced to the
 * element belonging to this position (row length filter_area.z), and
 * filter_area.z*filter_area.w is passed along as well, together with the
 * index of the thread inside its block. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
		float *transform, int *rank,
		int4 filter_area, int4 rect,
		int radius, float pca_threshold,
		int pass_stride)
{
	const int area_x = blockDim.x*blockIdx.x + threadIdx.x;
	const int area_y = blockDim.y*blockIdx.y + threadIdx.y;

	/* Nothing to do for threads outside of the filtered area. */
	if(area_x >= filter_area.z || area_y >= filter_area.w) {
		return;
	}

	/* Per-position storage of this thread. */
	int *pixel_rank = rank + area_y*filter_area.z + area_x;
	float *pixel_transform = transform + area_y*filter_area.z + area_x;

	kernel_filter_construct_transform(buffer,
	                                  area_x + filter_area.x, area_y + filter_area.y,
	                                  rect, pass_stride,
	                                  pixel_transform, pixel_rank,
	                                  radius, pca_threshold,
	                                  filter_area.z*filter_area.w,
	                                  threadIdx.y*blockDim.x + threadIdx.x);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_nlm_calc_difference().
 *
 * Computes the thread's position as its global 2D index plus the origin of
 * rect (.x, .y) and calls the device function for positions below the
 * exclusive end (.z, .w) of rect. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
		float ccl_restrict_ptr weightImage,
		float ccl_restrict_ptr varianceImage,
		float *differenceImage,
		int4 rect, int w,
		int channel_offset,
		float a, float k_2)
{
	const int pixel_x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	const int pixel_y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_calc_difference(pixel_x, pixel_y,
	                                  dx, dy,
	                                  weightImage, varianceImage,
	                                  differenceImage,
	                                  rect, w,
	                                  channel_offset,
	                                  a, k_2);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_nlm_blur().
 *
 * One thread per position of rect; positions at or past rect.z / rect.w
 * are skipped. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
	const int pixel_x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	const int pixel_y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_blur(pixel_x, pixel_y, differenceImage, outImage, rect, w, f);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_nlm_calc_weight().
 *
 * One thread per position of rect; positions at or past rect.z / rect.w
 * are skipped. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f)
{
	const int pixel_x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	const int pixel_y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_calc_weight(pixel_x, pixel_y, differenceImage, outImage, rect, w, f);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_nlm_update_output().
 *
 * Derives the thread's position inside rect (origin .x/.y, exclusive end
 * .z/.w) and forwards it with the remaining arguments to the device
 * function. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(int dx, int dy,
		float ccl_restrict_ptr differenceImage,
		float ccl_restrict_ptr image,
		float *outImage, float *accumImage,
		int4 rect, int w,
		int f)
{
	const int pixel_x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	const int pixel_y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_update_output(pixel_x, pixel_y,
	                                dx, dy,
	                                differenceImage, image,
	                                outImage, accumImage,
	                                rect, w, f);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_nlm_normalize().
 *
 * One thread per position of rect; positions at or past rect.z / rect.w
 * are skipped. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w)
{
	const int pixel_x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
	const int pixel_y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_normalize(pixel_x, pixel_y, outImage, accumImage, rect, w);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_nlm_construct_gramian().
 *
 * The thread position starts at max(0, rect.x - filter_rect.x) (same for y)
 * and is processed only while it stays below
 * min(filter_rect.z, rect.z - filter_rect.x) (same for y with .w). The
 * index of the thread inside its block is passed as the last argument. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
		float ccl_restrict_ptr differenceImage,
		float ccl_restrict_ptr buffer,
		float *color_pass,
		float *variance_pass,
		float const* __restrict__ transform,
		int *rank,
		float *XtWX,
		float3 *XtWY,
		int4 rect,
		int4 filter_rect,
		int w, int h, int f,
		int pass_stride)
{
	const int pos_x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
	const int pos_y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);

	/* Exclusive upper limits of the range handled by this launch. */
	const int end_x = min(filter_rect.z, rect.z-filter_rect.x);
	const int end_y = min(filter_rect.w, rect.w-filter_rect.y);

	if(pos_x >= end_x || pos_y >= end_y) {
		return;
	}

	kernel_filter_nlm_construct_gramian(pos_x, pos_y,
	                                    dx, dy,
	                                    differenceImage,
	                                    buffer,
	                                    color_pass, variance_pass,
	                                    transform, rank,
	                                    XtWX, XtWY,
	                                    rect, filter_rect,
	                                    w, h, f,
	                                    pass_stride,
	                                    threadIdx.y*blockDim.x + threadIdx.x);
}
|
||||||
|
|
||||||
|
/* CUDA entry point for kernel_filter_finalize().
 *
 * The thread position is checked against filter_area.z/.w. For positions
 * inside, rank, XtWX and XtWY are advanced by y*filter_area.z + x before
 * being passed on, and filter_area.z*filter_area.w is passed along with
 * them. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(int w, int h,
		float *buffer, int *rank,
		float *XtWX, float3 *XtWY,
		int4 filter_area, int4 buffer_params,
		int sample)
{
	const int area_x = blockDim.x*blockIdx.x + threadIdx.x;
	const int area_y = blockDim.y*blockIdx.y + threadIdx.y;

	if(area_x >= filter_area.z || area_y >= filter_area.w) {
		return;
	}

	/* Offset of this position's element in the per-position arrays. */
	const int element = area_y*filter_area.z+area_x;

	kernel_filter_finalize(area_x, area_y,
	                       w, h,
	                       buffer,
	                       rank + element,
	                       filter_area.z*filter_area.w,
	                       XtWX + element,
	                       XtWY + element,
	                       buffer_params,
	                       sample);
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
262
intern/cycles/kernel/kernels/opencl/filter.cl
Normal file
262
intern/cycles/kernel/kernels/opencl/filter.cl
Normal file
@@ -0,0 +1,262 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2011-2017 Blender Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* OpenCL kernel entry points */
|
||||||
|
|
||||||
|
#include "kernel/kernel_compat_opencl.h"
|
||||||
|
|
||||||
|
#include "kernel/filter/filter_kernel.h"
|
||||||
|
|
||||||
|
/* kernels */
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_divide_shadow().
 *
 * The work-item's global id is offset by the origin of prefilter_rect
 * (.x, .y). Work-items at or beyond the exclusive end (.z, .w) return
 * immediately; all others forward the arguments unchanged. */
__kernel void kernel_ocl_filter_divide_shadow(int sample,
		ccl_global TilesInfo *tiles,
		ccl_global float *unfilteredA,
		ccl_global float *unfilteredB,
		ccl_global float *sampleVariance,
		ccl_global float *sampleVarianceV,
		ccl_global float *bufferVariance,
		int4 prefilter_rect,
		int buffer_pass_stride,
		int buffer_denoising_offset,
		char use_split_variance)
{
	const int pixel_x = prefilter_rect.x + get_global_id(0);
	const int pixel_y = prefilter_rect.y + get_global_id(1);

	/* Work-items launched past the rectangle have nothing to process. */
	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
		return;
	}

	kernel_filter_divide_shadow(sample, tiles,
	                            pixel_x, pixel_y,
	                            unfilteredA, unfilteredB,
	                            sampleVariance, sampleVarianceV,
	                            bufferVariance,
	                            prefilter_rect,
	                            buffer_pass_stride,
	                            buffer_denoising_offset,
	                            use_split_variance);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_get_feature().
 *
 * Maps the work-item to a position inside prefilter_rect (origin .x/.y,
 * exclusive end .z/.w) and, if the position is inside, passes all arguments
 * through together with that position. */
__kernel void kernel_ocl_filter_get_feature(int sample,
		ccl_global TilesInfo *tiles,
		int m_offset,
		int v_offset,
		ccl_global float *mean,
		ccl_global float *variance,
		int4 prefilter_rect,
		int buffer_pass_stride,
		int buffer_denoising_offset,
		char use_split_variance)
{
	const int pixel_x = prefilter_rect.x + get_global_id(0);
	const int pixel_y = prefilter_rect.y + get_global_id(1);

	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
		return;
	}

	kernel_filter_get_feature(sample, tiles,
	                          m_offset, v_offset,
	                          pixel_x, pixel_y,
	                          mean, variance,
	                          prefilter_rect,
	                          buffer_pass_stride,
	                          buffer_denoising_offset,
	                          use_split_variance);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_combine_halves().
 *
 * One work-item per position of prefilter_rect; positions at or past
 * prefilter_rect.z / .w are skipped. */
__kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
		ccl_global float *variance,
		ccl_global float *a,
		ccl_global float *b,
		int4 prefilter_rect,
		int r)
{
	const int pixel_x = prefilter_rect.x + get_global_id(0);
	const int pixel_y = prefilter_rect.y + get_global_id(1);

	if(pixel_x >= prefilter_rect.z || pixel_y >= prefilter_rect.w) {
		return;
	}

	kernel_filter_combine_halves(pixel_x, pixel_y, mean, variance, a, b, prefilter_rect, r);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_construct_transform().
 *
 * The global id is interpreted relative to filter_area: it is compared
 * against filter_area.z/.w and shifted by filter_area.x/.y for the call.
 * The transform and rank pointers are advanced to the element belonging to
 * this position (row length filter_area.z), filter_area.z*filter_area.w is
 * passed along as well, and the last argument is the work-item's index
 * inside its work-group. */
__kernel void kernel_ocl_filter_construct_transform(ccl_global float ccl_restrict_ptr buffer,
		ccl_global float *transform,
		ccl_global int *rank,
		int4 filter_area,
		int4 rect,
		int pass_stride,
		int radius,
		float pca_threshold)
{
	const int area_x = get_global_id(0);
	const int area_y = get_global_id(1);

	/* Nothing to do for work-items outside of the filtered area. */
	if(area_x >= filter_area.z || area_y >= filter_area.w) {
		return;
	}

	/* Per-position storage of this work-item. */
	ccl_global int *pixel_rank = rank + area_y*filter_area.z + area_x;
	ccl_global float *pixel_transform = transform + area_y*filter_area.z + area_x;

	kernel_filter_construct_transform(buffer,
	                                  area_x + filter_area.x, area_y + filter_area.y,
	                                  rect, pass_stride,
	                                  pixel_transform, pixel_rank,
	                                  radius, pca_threshold,
	                                  filter_area.z*filter_area.w,
	                                  get_local_id(1)*get_local_size(0) + get_local_id(0));
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_nlm_calc_difference().
 *
 * Computes the work-item's position as its global id plus the origin of
 * rect (.x, .y) and calls the device function for positions below the
 * exclusive end (.z, .w) of rect. */
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
		int dy,
		ccl_global float ccl_restrict_ptr weightImage,
		ccl_global float ccl_restrict_ptr varianceImage,
		ccl_global float *differenceImage,
		int4 rect,
		int w,
		int channel_offset,
		float a,
		float k_2)
{
	const int pixel_x = get_global_id(0) + rect.x;
	const int pixel_y = get_global_id(1) + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_calc_difference(pixel_x, pixel_y,
	                                  dx, dy,
	                                  weightImage, varianceImage,
	                                  differenceImage,
	                                  rect, w,
	                                  channel_offset,
	                                  a, k_2);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_nlm_blur().
 *
 * One work-item per position of rect; positions at or past rect.z / rect.w
 * are skipped. */
__kernel void kernel_ocl_filter_nlm_blur(ccl_global float ccl_restrict_ptr differenceImage,
		ccl_global float *outImage,
		int4 rect,
		int w,
		int f)
{
	const int pixel_x = get_global_id(0) + rect.x;
	const int pixel_y = get_global_id(1) + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_blur(pixel_x, pixel_y, differenceImage, outImage, rect, w, f);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_nlm_calc_weight().
 *
 * One work-item per position of rect; positions at or past rect.z / rect.w
 * are skipped. */
__kernel void kernel_ocl_filter_nlm_calc_weight(ccl_global float ccl_restrict_ptr differenceImage,
		ccl_global float *outImage,
		int4 rect,
		int w,
		int f)
{
	const int pixel_x = get_global_id(0) + rect.x;
	const int pixel_y = get_global_id(1) + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_calc_weight(pixel_x, pixel_y, differenceImage, outImage, rect, w, f);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_nlm_update_output().
 *
 * Derives the work-item's position inside rect (origin .x/.y, exclusive end
 * .z/.w) and forwards it with the remaining arguments to the device
 * function. */
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
		int dy,
		ccl_global float ccl_restrict_ptr differenceImage,
		ccl_global float ccl_restrict_ptr image,
		ccl_global float *outImage,
		ccl_global float *accumImage,
		int4 rect,
		int w,
		int f)
{
	const int pixel_x = get_global_id(0) + rect.x;
	const int pixel_y = get_global_id(1) + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_update_output(pixel_x, pixel_y,
	                                dx, dy,
	                                differenceImage, image,
	                                outImage, accumImage,
	                                rect, w, f);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_nlm_normalize().
 *
 * One work-item per position of rect; positions at or past rect.z / rect.w
 * are skipped. */
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *outImage,
		ccl_global float ccl_restrict_ptr accumImage,
		int4 rect,
		int w)
{
	const int pixel_x = get_global_id(0) + rect.x;
	const int pixel_y = get_global_id(1) + rect.y;

	if(pixel_x >= rect.z || pixel_y >= rect.w) {
		return;
	}

	kernel_filter_nlm_normalize(pixel_x, pixel_y, outImage, accumImage, rect, w);
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_nlm_construct_gramian().
 *
 * The work-item position starts at max(0, rect.x - filter_rect.x) (same for
 * y) and is processed only while it stays below
 * min(filter_rect.z, rect.z - filter_rect.x) (same for y with .w). The
 * index of the work-item inside its work-group is passed as the last
 * argument. */
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
		int dy,
		ccl_global float ccl_restrict_ptr differenceImage,
		ccl_global float ccl_restrict_ptr buffer,
		ccl_global float *color_pass,
		ccl_global float *variance_pass,
		ccl_global float ccl_restrict_ptr transform,
		ccl_global int *rank,
		ccl_global float *XtWX,
		ccl_global float3 *XtWY,
		int4 rect,
		int4 filter_rect,
		int w,
		int h,
		int f,
		int pass_stride)
{
	const int pos_x = get_global_id(0) + max(0, rect.x-filter_rect.x);
	const int pos_y = get_global_id(1) + max(0, rect.y-filter_rect.y);

	/* Exclusive upper limits of the range handled by this launch. */
	const int end_x = min(filter_rect.z, rect.z-filter_rect.x);
	const int end_y = min(filter_rect.w, rect.w-filter_rect.y);

	if(pos_x >= end_x || pos_y >= end_y) {
		return;
	}

	kernel_filter_nlm_construct_gramian(pos_x, pos_y,
	                                    dx, dy,
	                                    differenceImage,
	                                    buffer,
	                                    color_pass, variance_pass,
	                                    transform, rank,
	                                    XtWX, XtWY,
	                                    rect, filter_rect,
	                                    w, h, f,
	                                    pass_stride,
	                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
}
|
||||||
|
|
||||||
|
/* OpenCL entry point for kernel_filter_finalize().
 *
 * The global id is checked against filter_area.z/.w. For positions inside,
 * rank, XtWX and XtWY are advanced by y*filter_area.z + x before being
 * passed on, and filter_area.z*filter_area.w is passed along with them. */
__kernel void kernel_ocl_filter_finalize(int w,
		int h,
		ccl_global float *buffer,
		ccl_global int *rank,
		ccl_global float *XtWX,
		ccl_global float3 *XtWY,
		int4 filter_area,
		int4 buffer_params,
		int sample)
{
	const int area_x = get_global_id(0);
	const int area_y = get_global_id(1);

	if(area_x >= filter_area.z || area_y >= filter_area.w) {
		return;
	}

	/* Offset of this position's element in the per-position arrays. */
	const int element = area_y*filter_area.z+area_x;

	kernel_filter_finalize(area_x, area_y,
	                       w, h,
	                       buffer,
	                       rank + element,
	                       filter_area.z*filter_area.w,
	                       XtWX + element,
	                       XtWY + element,
	                       buffer_params,
	                       sample);
}
|
||||||
|
|
||||||
|
/* Stores the nine buffer pointers passed as arguments into
 * tiles->buffers[0..8], in argument order.
 *
 * Only the work-item with global id (0, 0) performs the writes; every other
 * work-item returns without touching memory. */
__kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
		ccl_global float *buffer_1,
		ccl_global float *buffer_2,
		ccl_global float *buffer_3,
		ccl_global float *buffer_4,
		ccl_global float *buffer_5,
		ccl_global float *buffer_6,
		ccl_global float *buffer_7,
		ccl_global float *buffer_8,
		ccl_global float *buffer_9)
{
	/* A single work-item is enough to fill in the structure. */
	if((get_global_id(0) != 0) || (get_global_id(1) != 0)) {
		return;
	}

	tiles->buffers[0] = buffer_1;
	tiles->buffers[1] = buffer_2;
	tiles->buffers[2] = buffer_3;
	tiles->buffers[3] = buffer_4;
	tiles->buffers[4] = buffer_5;
	tiles->buffers[5] = buffer_6;
	tiles->buffers[6] = buffer_7;
	tiles->buffers[7] = buffer_8;
	tiles->buffers[8] = buffer_9;
}
|
@@ -76,6 +76,26 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
|
|||||||
RNG rng = kernel_split_state.rng[ray_index];
|
RNG rng = kernel_split_state.rng[ray_index];
|
||||||
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
||||||
float3 throughput = branched_state->throughput;
|
float3 throughput = branched_state->throughput;
|
||||||
|
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
|
||||||
|
|
||||||
|
float sum_sample_weight = 0.0f;
|
||||||
|
#ifdef __DENOISING_FEATURES__
|
||||||
|
if(ps->denoising_feature_weight > 0.0f) {
|
||||||
|
for(int i = 0; i < sd->num_closure; i++) {
|
||||||
|
const ShaderClosure *sc = &sd->closure[i];
|
||||||
|
|
||||||
|
/* transparency is not handled here, but in outer loop */
|
||||||
|
if(!CLOSURE_IS_BSDF(sc->type) || CLOSURE_IS_BSDF_TRANSPARENT(sc->type)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
sum_sample_weight += sc->sample_weight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
sum_sample_weight = 1.0f;
|
||||||
|
}
|
||||||
|
#endif /* __DENOISING_FEATURES__ */
|
||||||
|
|
||||||
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
|
for(int i = branched_state->next_closure; i < sd->num_closure; i++) {
|
||||||
const ShaderClosure *sc = &sd->closure[i];
|
const ShaderClosure *sc = &sd->closure[i];
|
||||||
@@ -103,7 +123,6 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
|
|||||||
RNG bsdf_rng = cmj_hash(rng, i);
|
RNG bsdf_rng = cmj_hash(rng, i);
|
||||||
|
|
||||||
for(int j = branched_state->next_sample; j < num_samples; j++) {
|
for(int j = branched_state->next_sample; j < num_samples; j++) {
|
||||||
ccl_global PathState *ps = &kernel_split_state.path_state[ray_index];
|
|
||||||
if(reset_path_state) {
|
if(reset_path_state) {
|
||||||
*ps = branched_state->path_state;
|
*ps = branched_state->path_state;
|
||||||
}
|
}
|
||||||
@@ -122,7 +141,8 @@ ccl_device_noinline bool kernel_split_branched_path_surface_indirect_light_iter(
|
|||||||
tp,
|
tp,
|
||||||
ps,
|
ps,
|
||||||
L,
|
L,
|
||||||
bsdf_ray))
|
bsdf_ray,
|
||||||
|
sum_sample_weight))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@@ -111,24 +111,15 @@ ccl_device void kernel_buffer_update(KernelGlobals *kg,
|
|||||||
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
|
buffer += (kernel_split_params.offset + pixel_x + pixel_y*stride) * kernel_data.film.pass_stride;
|
||||||
|
|
||||||
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
|
if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
|
||||||
float3 L_sum;
|
|
||||||
#ifdef __SHADOW_TRICKS__
|
|
||||||
if(state->flag & PATH_RAY_SHADOW_CATCHER) {
|
|
||||||
L_sum = path_radiance_sum_shadowcatcher(kg, L, L_transparent);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif /* __SHADOW_TRICKS__ */
|
|
||||||
{
|
|
||||||
L_sum = path_radiance_clamp_and_sum(kg, L);
|
|
||||||
}
|
|
||||||
kernel_write_light_passes(kg, buffer, L, sample);
|
kernel_write_light_passes(kg, buffer, L, sample);
|
||||||
#ifdef __KERNEL_DEBUG__
|
#ifdef __KERNEL_DEBUG__
|
||||||
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
|
kernel_write_debug_passes(kg, buffer, state, debug_data, sample);
|
||||||
#endif
|
#endif
|
||||||
float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
|
|
||||||
|
|
||||||
/* accumulate result in output buffer */
|
/* accumulate result in output buffer */
|
||||||
kernel_write_pass_float4(buffer, sample, L_rad);
|
bool is_shadow_catcher = (state->flag & PATH_RAY_SHADOW_CATCHER);
|
||||||
|
kernel_write_result(kg, buffer, sample, L, 1.0f - (*L_transparent), is_shadow_catcher);
|
||||||
|
|
||||||
path_rng_end(kg, rng_state, rng);
|
path_rng_end(kg, rng_state, rng);
|
||||||
|
|
||||||
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
|
ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
|
||||||
|
@@ -125,7 +125,7 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
|
|||||||
#ifdef __SHADOW_TRICKS__
|
#ifdef __SHADOW_TRICKS__
|
||||||
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
if((sd->object_flag & SD_OBJECT_SHADOW_CATCHER)) {
|
||||||
if(state->flag & PATH_RAY_CAMERA) {
|
if(state->flag & PATH_RAY_CAMERA) {
|
||||||
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY);
|
state->flag |= (PATH_RAY_SHADOW_CATCHER | PATH_RAY_SHADOW_CATCHER_ONLY | PATH_RAY_STORE_SHADOW_INFO);
|
||||||
state->catcher_object = sd->object;
|
state->catcher_object = sd->object;
|
||||||
if(!kernel_data.background.transparent) {
|
if(!kernel_data.background.transparent) {
|
||||||
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
PathRadiance *L = &kernel_split_state.path_radiance[ray_index];
|
||||||
@@ -246,6 +246,8 @@ ccl_device void kernel_holdout_emission_blurring_pathtermination_ao(
|
|||||||
kernel_split_state.throughput[ray_index] = throughput/probability;
|
kernel_split_state.throughput[ray_index] = throughput/probability;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kernel_update_denoising_features(kg, sd, state, L);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -89,10 +89,10 @@ ccl_device void kernel_shadow_blocked_dl(KernelGlobals *kg)
|
|||||||
&shadow))
|
&shadow))
|
||||||
{
|
{
|
||||||
/* accumulate */
|
/* accumulate */
|
||||||
path_radiance_accum_light(L, throughput, &L_light, shadow, 1.0f, state->bounce, is_lamp);
|
path_radiance_accum_light(L, state, throughput, &L_light, shadow, 1.0f, is_lamp);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
path_radiance_accum_total_light(L, throughput, &L_light);
|
path_radiance_accum_total_light(L, state, throughput, &L_light);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -444,6 +444,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
|
|||||||
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
|
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
|
||||||
|
|
||||||
if(bsdf) {
|
if(bsdf) {
|
||||||
|
bsdf->N = N;
|
||||||
sd->flag |= bsdf_transparent_setup(bsdf);
|
sd->flag |= bsdf_transparent_setup(bsdf);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -704,6 +705,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals *kg, ShaderData *sd, float *
|
|||||||
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
|
ShaderClosure *bsdf = bsdf_alloc(sd, sizeof(ShaderClosure), weight);
|
||||||
|
|
||||||
if(bsdf) {
|
if(bsdf) {
|
||||||
|
bsdf->N = N;
|
||||||
/* todo: giving a fixed weight here will cause issues when
|
/* todo: giving a fixed weight here will cause issues when
|
||||||
* mixing multiple BSDFS. energy will not be conserved and
|
* mixing multiple BSDFS. energy will not be conserved and
|
||||||
* the throughput can blow up after multiple bounces. we
|
* the throughput can blow up after multiple bounces. we
|
||||||
|
@@ -63,8 +63,13 @@ ccl_device void svm_node_set_bump(KernelGlobals *kg, ShaderData *sd, float *stac
|
|||||||
strength = max(strength, 0.0f);
|
strength = max(strength, 0.0f);
|
||||||
|
|
||||||
/* compute and output perturbed normal */
|
/* compute and output perturbed normal */
|
||||||
float3 normal_out = normalize(absdet*normal_in - distance*signf(det)*surfgrad);
|
float3 normal_out = safe_normalize(absdet*normal_in - distance*signf(det)*surfgrad);
|
||||||
|
if(is_zero(normal_out)) {
|
||||||
|
normal_out = normal_in;
|
||||||
|
}
|
||||||
|
else {
|
||||||
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
|
normal_out = normalize(strength*normal_out + (1.0f - strength)*normal_in);
|
||||||
|
}
|
||||||
|
|
||||||
if(use_object_space) {
|
if(use_object_space) {
|
||||||
object_normal_transform(kg, sd, &normal_out);
|
object_normal_transform(kg, sd, &normal_out);
|
||||||
|
@@ -37,6 +37,7 @@ ccl_device_inline void svm_node_geometry(KernelGlobals *kg,
|
|||||||
#ifdef __UV__
|
#ifdef __UV__
|
||||||
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
|
case NODE_GEOM_uv: data = make_float3(sd->u, sd->v, 0.0f); break;
|
||||||
#endif
|
#endif
|
||||||
|
default: data = make_float3(0.0f, 0.0f, 0.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
stack_store_float3(stack, out_offset, data);
|
stack_store_float3(stack, out_offset, data);
|
||||||
|
@@ -317,7 +317,7 @@ ccl_device void svm_node_tex_environment(KernelGlobals *kg, ShaderData *sd, floa
|
|||||||
float3 co = stack_load_float3(stack, co_offset);
|
float3 co = stack_load_float3(stack, co_offset);
|
||||||
float2 uv;
|
float2 uv;
|
||||||
|
|
||||||
co = normalize(co);
|
co = safe_normalize(co);
|
||||||
|
|
||||||
if(projection == 0)
|
if(projection == 0)
|
||||||
uv = direction_to_equirectangular(co);
|
uv = direction_to_equirectangular(co);
|
||||||
|
@@ -402,7 +402,6 @@ typedef enum ClosureType {
|
|||||||
CLOSURE_BSDF_DIFFUSE_TOON_ID,
|
CLOSURE_BSDF_DIFFUSE_TOON_ID,
|
||||||
|
|
||||||
/* Glossy */
|
/* Glossy */
|
||||||
CLOSURE_BSDF_GLOSSY_ID,
|
|
||||||
CLOSURE_BSDF_REFLECTION_ID,
|
CLOSURE_BSDF_REFLECTION_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_GGX_ID,
|
CLOSURE_BSDF_MICROFACET_GGX_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
|
CLOSURE_BSDF_MICROFACET_GGX_FRESNEL_ID,
|
||||||
@@ -423,14 +422,13 @@ typedef enum ClosureType {
|
|||||||
CLOSURE_BSDF_HAIR_REFLECTION_ID,
|
CLOSURE_BSDF_HAIR_REFLECTION_ID,
|
||||||
|
|
||||||
/* Transmission */
|
/* Transmission */
|
||||||
CLOSURE_BSDF_TRANSMISSION_ID,
|
|
||||||
CLOSURE_BSDF_TRANSLUCENT_ID,
|
CLOSURE_BSDF_TRANSLUCENT_ID,
|
||||||
CLOSURE_BSDF_REFRACTION_ID,
|
CLOSURE_BSDF_REFRACTION_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
|
CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
|
CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID,
|
||||||
|
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
|
CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
|
CLOSURE_BSDF_MICROFACET_GGX_GLASS_ID,
|
||||||
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID,
|
|
||||||
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
|
CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_FRESNEL_ID,
|
||||||
CLOSURE_BSDF_SHARP_GLASS_ID,
|
CLOSURE_BSDF_SHARP_GLASS_ID,
|
||||||
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
|
CLOSURE_BSDF_HAIR_TRANSMISSION_ID,
|
||||||
@@ -465,13 +463,16 @@ typedef enum ClosureType {
|
|||||||
/* watch this, being lazy with memory usage */
|
/* watch this, being lazy with memory usage */
|
||||||
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
|
#define CLOSURE_IS_BSDF(type) (type <= CLOSURE_BSDF_TRANSPARENT_ID)
|
||||||
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
|
#define CLOSURE_IS_BSDF_DIFFUSE(type) (type >= CLOSURE_BSDF_DIFFUSE_ID && type <= CLOSURE_BSDF_DIFFUSE_TOON_ID)
|
||||||
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_GLOSSY_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
|
#define CLOSURE_IS_BSDF_GLOSSY(type) (type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_HAIR_REFLECTION_ID)
|
||||||
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSMISSION_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
|
#define CLOSURE_IS_BSDF_TRANSMISSION(type) (type >= CLOSURE_BSDF_TRANSLUCENT_ID && type <= CLOSURE_BSDF_HAIR_TRANSMISSION_ID)
|
||||||
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
|
#define CLOSURE_IS_BSDF_BSSRDF(type) (type == CLOSURE_BSDF_BSSRDF_ID || type == CLOSURE_BSDF_BSSRDF_PRINCIPLED_ID)
|
||||||
|
#define CLOSURE_IS_BSDF_TRANSPARENT(type) (type == CLOSURE_BSDF_TRANSPARENT_ID)
|
||||||
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
|
#define CLOSURE_IS_BSDF_ANISOTROPIC(type) (type >= CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID)
|
||||||
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
|
#define CLOSURE_IS_BSDF_MULTISCATTER(type) (type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ID ||\
|
||||||
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
|
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_ANISO_ID || \
|
||||||
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
|
type == CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID)
|
||||||
|
#define CLOSURE_IS_BSDF_MICROFACET(type) ((type >= CLOSURE_BSDF_REFLECTION_ID && type <= CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID) ||\
|
||||||
|
(type >= CLOSURE_BSDF_REFRACTION_ID && type <= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID))
|
||||||
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
|
#define CLOSURE_IS_BSDF_OR_BSSRDF(type) (type <= CLOSURE_BSSRDF_BURLEY_ID)
|
||||||
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
|
#define CLOSURE_IS_BSSRDF(type) (type >= CLOSURE_BSSRDF_CUBIC_ID && type <= CLOSURE_BSSRDF_BURLEY_ID)
|
||||||
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
|
#define CLOSURE_IS_VOLUME(type) (type >= CLOSURE_VOLUME_ID && type <= CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
|
||||||
@@ -480,7 +481,7 @@ typedef enum ClosureType {
|
|||||||
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
|
#define CLOSURE_IS_BACKGROUND(type) (type == CLOSURE_BACKGROUND_ID)
|
||||||
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
|
#define CLOSURE_IS_AMBIENT_OCCLUSION(type) (type == CLOSURE_AMBIENT_OCCLUSION_ID)
|
||||||
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
|
#define CLOSURE_IS_PHASE(type) (type == CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID)
|
||||||
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_BECKMANN_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
|
#define CLOSURE_IS_GLASS(type) (type >= CLOSURE_BSDF_MICROFACET_MULTI_GGX_GLASS_ID && type <= CLOSURE_BSDF_SHARP_GLASS_ID)
|
||||||
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
|
#define CLOSURE_IS_PRINCIPLED(type) (type == CLOSURE_BSDF_PRINCIPLED_ID)
|
||||||
|
|
||||||
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
|
#define CLOSURE_WEIGHT_CUTOFF 1e-5f
|
||||||
|
@@ -42,6 +42,9 @@ BufferParams::BufferParams()
|
|||||||
full_width = 0;
|
full_width = 0;
|
||||||
full_height = 0;
|
full_height = 0;
|
||||||
|
|
||||||
|
denoising_data_pass = false;
|
||||||
|
denoising_clean_pass = false;
|
||||||
|
|
||||||
Pass::add(PASS_COMBINED, passes);
|
Pass::add(PASS_COMBINED, passes);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -69,9 +72,24 @@ int BufferParams::get_passes_size()
|
|||||||
for(size_t i = 0; i < passes.size(); i++)
|
for(size_t i = 0; i < passes.size(); i++)
|
||||||
size += passes[i].components;
|
size += passes[i].components;
|
||||||
|
|
||||||
|
if(denoising_data_pass) {
|
||||||
|
size += DENOISING_PASS_SIZE_BASE;
|
||||||
|
if(denoising_clean_pass) size += DENOISING_PASS_SIZE_CLEAN;
|
||||||
|
}
|
||||||
|
|
||||||
return align_up(size, 4);
|
return align_up(size, 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int BufferParams::get_denoising_offset()
|
||||||
|
{
|
||||||
|
int offset = 0;
|
||||||
|
|
||||||
|
for(size_t i = 0; i < passes.size(); i++)
|
||||||
|
offset += passes[i].components;
|
||||||
|
|
||||||
|
return offset;
|
||||||
|
}
|
||||||
|
|
||||||
/* Render Buffer Task */
|
/* Render Buffer Task */
|
||||||
|
|
||||||
RenderTile::RenderTile()
|
RenderTile::RenderTile()
|
||||||
@@ -138,12 +156,51 @@ void RenderBuffers::reset(Device *device, BufferParams& params_)
|
|||||||
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
|
device->mem_alloc("rng_state", rng_state, MEM_READ_WRITE);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool RenderBuffers::copy_from_device()
|
bool RenderBuffers::copy_from_device(Device *from_device)
|
||||||
{
|
{
|
||||||
if(!buffer.device_pointer)
|
if(!buffer.device_pointer)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
|
if(!from_device) {
|
||||||
|
from_device = device;
|
||||||
|
}
|
||||||
|
|
||||||
|
from_device->mem_copy_from(buffer, 0, params.width, params.height, params.get_passes_size()*sizeof(float));
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool RenderBuffers::get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels)
|
||||||
|
{
|
||||||
|
float scale = 1.0f/sample;
|
||||||
|
|
||||||
|
if(offset == DENOISING_PASS_COLOR) {
|
||||||
|
scale *= exposure;
|
||||||
|
}
|
||||||
|
else if(offset == DENOISING_PASS_COLOR_VAR) {
|
||||||
|
scale *= exposure*exposure;
|
||||||
|
}
|
||||||
|
|
||||||
|
offset += params.get_denoising_offset();
|
||||||
|
float *in = (float*)buffer.data_pointer + offset;
|
||||||
|
int pass_stride = params.get_passes_size();
|
||||||
|
int size = params.width*params.height;
|
||||||
|
|
||||||
|
if(components == 1) {
|
||||||
|
for(int i = 0; i < size; i++, in += pass_stride, pixels++) {
|
||||||
|
pixels[0] = in[0]*scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if(components == 3) {
|
||||||
|
for(int i = 0; i < size; i++, in += pass_stride, pixels += 3) {
|
||||||
|
pixels[0] = in[0]*scale;
|
||||||
|
pixels[1] = in[1]*scale;
|
||||||
|
pixels[2] = in[2]*scale;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@@ -51,6 +51,9 @@ public:
|
|||||||
|
|
||||||
/* passes */
|
/* passes */
|
||||||
array<Pass> passes;
|
array<Pass> passes;
|
||||||
|
bool denoising_data_pass;
|
||||||
|
/* If only some light path types should be denoised, an additional pass is needed. */
|
||||||
|
bool denoising_clean_pass;
|
||||||
|
|
||||||
/* functions */
|
/* functions */
|
||||||
BufferParams();
|
BufferParams();
|
||||||
@@ -59,6 +62,7 @@ public:
|
|||||||
bool modified(const BufferParams& params);
|
bool modified(const BufferParams& params);
|
||||||
void add_pass(PassType type);
|
void add_pass(PassType type);
|
||||||
int get_passes_size();
|
int get_passes_size();
|
||||||
|
int get_denoising_offset();
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Render Buffers */
|
/* Render Buffers */
|
||||||
@@ -73,18 +77,19 @@ public:
|
|||||||
/* random number generator state */
|
/* random number generator state */
|
||||||
device_vector<uint> rng_state;
|
device_vector<uint> rng_state;
|
||||||
|
|
||||||
|
Device *device;
|
||||||
|
|
||||||
explicit RenderBuffers(Device *device);
|
explicit RenderBuffers(Device *device);
|
||||||
~RenderBuffers();
|
~RenderBuffers();
|
||||||
|
|
||||||
void reset(Device *device, BufferParams& params);
|
void reset(Device *device, BufferParams& params);
|
||||||
|
|
||||||
bool copy_from_device();
|
bool copy_from_device(Device *from_device = NULL);
|
||||||
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
|
bool get_pass_rect(PassType type, float exposure, int sample, int components, float *pixels);
|
||||||
|
bool get_denoising_pass_rect(int offset, float exposure, int sample, int components, float *pixels);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void device_free();
|
void device_free();
|
||||||
|
|
||||||
Device *device;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Display Buffer
|
/* Display Buffer
|
||||||
@@ -131,6 +136,9 @@ protected:
|
|||||||
|
|
||||||
class RenderTile {
|
class RenderTile {
|
||||||
public:
|
public:
|
||||||
|
typedef enum { PATH_TRACE, DENOISE } Task;
|
||||||
|
|
||||||
|
Task task;
|
||||||
int x, y, w, h;
|
int x, y, w, h;
|
||||||
int start_sample;
|
int start_sample;
|
||||||
int num_samples;
|
int num_samples;
|
||||||
@@ -138,6 +146,7 @@ public:
|
|||||||
int resolution;
|
int resolution;
|
||||||
int offset;
|
int offset;
|
||||||
int stride;
|
int stride;
|
||||||
|
int tile_index;
|
||||||
|
|
||||||
device_ptr buffer;
|
device_ptr buffer;
|
||||||
device_ptr rng_state;
|
device_ptr rng_state;
|
||||||
|
@@ -279,6 +279,10 @@ NODE_DEFINE(Film)
|
|||||||
|
|
||||||
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
|
SOCKET_BOOLEAN(use_sample_clamp, "Use Sample Clamp", false);
|
||||||
|
|
||||||
|
SOCKET_BOOLEAN(denoising_data_pass, "Generate Denoising Data Pass", false);
|
||||||
|
SOCKET_BOOLEAN(denoising_clean_pass, "Generate Denoising Clean Pass", false);
|
||||||
|
SOCKET_INT(denoising_flags, "Denoising Flags", 0);
|
||||||
|
|
||||||
return type;
|
return type;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -437,6 +441,20 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
|
|||||||
kfilm->pass_stride += pass.components;
|
kfilm->pass_stride += pass.components;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kfilm->pass_denoising_data = 0;
|
||||||
|
kfilm->pass_denoising_clean = 0;
|
||||||
|
kfilm->denoising_flags = 0;
|
||||||
|
if(denoising_data_pass) {
|
||||||
|
kfilm->pass_denoising_data = kfilm->pass_stride;
|
||||||
|
kfilm->pass_stride += DENOISING_PASS_SIZE_BASE;
|
||||||
|
kfilm->denoising_flags = denoising_flags;
|
||||||
|
if(denoising_clean_pass) {
|
||||||
|
kfilm->pass_denoising_clean = kfilm->pass_stride;
|
||||||
|
kfilm->pass_stride += DENOISING_PASS_SIZE_CLEAN;
|
||||||
|
kfilm->use_light_pass = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
|
kfilm->pass_stride = align_up(kfilm->pass_stride, 4);
|
||||||
kfilm->pass_alpha_threshold = pass_alpha_threshold;
|
kfilm->pass_alpha_threshold = pass_alpha_threshold;
|
||||||
|
|
||||||
@@ -451,6 +469,10 @@ void Film::device_update(Device *device, DeviceScene *dscene, Scene *scene)
|
|||||||
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
|
kfilm->mist_inv_depth = (mist_depth > 0.0f)? 1.0f/mist_depth: 0.0f;
|
||||||
kfilm->mist_falloff = mist_falloff;
|
kfilm->mist_falloff = mist_falloff;
|
||||||
|
|
||||||
|
pass_stride = kfilm->pass_stride;
|
||||||
|
denoising_data_offset = kfilm->pass_denoising_data;
|
||||||
|
denoising_clean_offset = kfilm->pass_denoising_clean;
|
||||||
|
|
||||||
need_update = false;
|
need_update = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user