Cycles: oneAPI: Improve performance of scenes not using volume #109245

Closed
Nikita Sirgienko wants to merge 2 commits from Sirgienko/blender:oneapi_shade_surface_perf_optimization into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
5 changed files with 57 additions and 2 deletions

View File

@ -279,6 +279,27 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel_postfix
#ifdef __KERNEL_ONEAPI__
/* oneAPI-only variant of the surface shading kernel, forwarding to
 * integrator_shade_surface_no_volume.
 *
 * The NODE_VOLUME feature adds a lot of code, including the entire NanoVDB library, which makes
 * GPU compilers spend additional registers even though the feature is not used often, leading
 * to suboptimal execution. Using this specialized version gives a noticeable speed-up for
 * oneAPI execution.
 *
 * path_index_array: optional table mapping the work item index to a path state index; when it
 *                   is null, the work item index itself is used as the state index.
 * render_buffer:    forwarded unchanged to integrator_shade_surface_no_volume.
 * work_size:        bound handed to ccl_gpu_kernel_within_bounds; work items that fail this
 *                   check do nothing. */
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
    ccl_gpu_kernel_signature(integrator_shade_surface_no_volume,
                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int work_index = ccl_gpu_global_id_x();

  if (ccl_gpu_kernel_within_bounds(work_index, work_size)) {
    /* Without an index array the work item index directly identifies the path state. */
    int path_state = work_index;
    if (path_index_array) {
      path_state = path_index_array[work_index];
    }

    ccl_gpu_kernel_call(integrator_shade_surface_no_volume(NULL, path_state, render_buffer));
  }
}
ccl_gpu_kernel_postfix
#endif
/* Declares a Metal function constant bound to the Kernel_DummyConstant index. It is only
 * compiled when both __KERNEL_METAL_APPLE__ and __METALRT__ are defined.
 * NOTE(review): the reason a dummy function constant is needed is not visible here --
 * presumably a MetalRT pipeline specialization requirement; confirm before changing. */
#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif

View File

@ -463,8 +463,18 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
if (kernel_features & KERNEL_FEATURE_NODE_VOLUME) {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
}
else {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_shade_surface_no_volume);
}
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: {

View File

@ -799,6 +799,15 @@ ccl_device_forceinline void integrator_shade_surface(KernelGlobals kg,
integrator_shade_surface_next_kernel<current_kernel>(kg, state);
}
#ifdef __KERNEL_ONEAPI__
/* Surface shading entry point for oneAPI that instantiates integrator_shade_surface with the
 * surface node feature mask, with both the ray-tracing and the volume node features removed.
 * All arguments are forwarded unchanged. */
ccl_device_forceinline void integrator_shade_surface_no_volume(
    KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
{
  /* Surface node features, excluding NODE_RAYTRACE and NODE_VOLUME. */
  constexpr auto no_volume_feature_mask = KERNEL_FEATURE_NODE_MASK_SURFACE &
                                          ~(KERNEL_FEATURE_NODE_RAYTRACE |
                                            KERNEL_FEATURE_NODE_VOLUME);

  integrator_shade_surface<no_volume_feature_mask>(kg, state, render_buffer);
}
#endif
ccl_device_forceinline void integrator_shade_surface_raytrace(
KernelGlobals kg, IntegratorState state, ccl_global float *ccl_restrict render_buffer)
{

View File

@ -1944,6 +1944,19 @@ void PointDensityTextureNode::compile(OSLCompiler &compiler)
}
}
/* Returns the kernel feature flags required by this node: the flags reported by the base
 * ShaderNode, plus KERNEL_FEATURE_NODE_VOLUME when the "Density" or "Color" output has at least
 * one link. */
int PointDensityTextureNode::get_feature()
{
  /* NOTE: The need for the NODE_VOLUME feature is conditional and mirrors the corresponding
   * logic in the ::compile implementation. */
  const bool density_linked = !output("Density")->links.empty();
  const bool color_linked = !output("Color")->links.empty();

  int features = ShaderNode::get_feature();
  if (density_linked || color_linked) {
    features |= KERNEL_FEATURE_NODE_VOLUME;
  }
  return features;
}
/* Normal */
NODE_DEFINE(NormalNode)

View File

@ -346,6 +346,8 @@ class PointDensityTextureNode : public ShaderNode {
public:
SHADER_NODE_NO_CLONE_CLASS(PointDensityTextureNode)
virtual int get_feature();
~PointDensityTextureNode();
ShaderNode *clone(ShaderGraph *graph) const;
void attributes(Shader *shader, AttributeRequestSet *attributes);