Cycles: oneAPI: use local memory for faster shader sorting #107994

Merged
Xavier Hallade merged 2 commits from Sirgienko/blender:oneapi_local_memory_sorting into main 2023-05-17 11:08:10 +02:00
12 changed files with 177 additions and 21 deletions

View File

@@ -262,6 +262,11 @@ string OneapiDevice::oneapi_error_message()
return string(oneapi_error_string_);
}
int OneapiDevice::scene_max_shaders()
{
return scene_max_shaders_;
}
void *OneapiDevice::kernel_globals_device_pointer()
{
return kg_memory_device_;
@@ -436,6 +441,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
/* Update scene handle (it is different for each device in a multi-device setup). */
KernelData *const data = (KernelData *)host;
data->device_bvh = embree_scene;
/* We need this number later for proper local memory allocation. */
scene_max_shaders_ = data->max_shaders;
}
# endif
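
The value cached here sizes the group-local memory that the sort kernels request at enqueue time. A minimal standalone SYCL sketch of that pattern (illustrative only, not Blender code; the function name is hypothetical):

#include <sycl/sycl.hpp>

/* A host-side value captured earlier (like scene_max_shaders_ above) sizes a
 * local_accessor when the kernel is enqueued. */
void enqueue_with_local_mem(sycl::queue &q, int max_shaders) {
  q.submit([&](sycl::handler &cgh) {
    /* One int of group-local memory per shader, allocated at enqueue time. */
    sycl::local_accessor<int> local_mem(max_shaders, cgh);
    cgh.parallel_for(sycl::nd_range<1>(1024, 64), [=](sycl::nd_item<1> it) {
      const int lid = it.get_local_id(0);
      if (lid < max_shaders) {
        local_mem[lid] = 0; /* e.g. clear per-shader bucket counters */
      }
    });
  });
}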

View File

@@ -37,6 +37,7 @@ class OneapiDevice : public Device {
std::string oneapi_error_string_;
bool use_hardware_raytracing = false;
unsigned int kernel_features = 0;
int scene_max_shaders_ = 0;
public:
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
@@ -61,6 +62,8 @@ class OneapiDevice : public Device {
string oneapi_error_message();
int scene_max_shaders();
void *kernel_globals_device_pointer();
void mem_alloc(device_memory &mem) override;

View File

@@ -59,7 +59,7 @@ void OneapiDeviceQueue::init_execution()
void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
assert(device_queue);
assert(kg_dptr);
kernel_context_ = new KernelContext{device_queue, kg_dptr};
kernel_context_ = new KernelContext{device_queue, kg_dptr, 0};
debug_init_execution();
}
@@ -78,12 +78,13 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
assert(signed_kernel_work_size >= 0);
size_t kernel_work_size = (size_t)signed_kernel_work_size;
assert(kernel_context_);
kernel_context_->scene_max_shaders = oneapi_device_->scene_max_shaders();
size_t kernel_local_size = oneapi_kernel_preferred_local_size(
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
assert(kernel_context_);
/* Call the oneAPI kernel DLL to launch the requested kernel. */
bool is_finished_ok = oneapi_device_->enqueue_kernel(
kernel_context_, kernel, uniformed_kernel_work_size, args);

View File

@@ -39,6 +39,11 @@ class OneapiDeviceQueue : public DeviceQueue {
virtual void copy_to_device(device_memory &mem) override;
virtual void copy_from_device(device_memory &mem) override;
virtual bool supports_local_atomic_sort() const
{
return true;
}
protected:
OneapiDevice *oneapi_device_;
KernelContext *kernel_context_;

View File

@@ -385,11 +385,17 @@ void PathTraceWorkGPU::enqueue_reset()
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
queue_->zero_to_device(integrator_queue_counter_);
queue_->zero_to_device(integrator_shader_sort_counter_);
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_sort_counter_.size() != 0) {
queue_->zero_to_device(integrator_shader_sort_counter_);
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE &&
integrator_shader_raytrace_sort_counter_.size() != 0)
{
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE &&
integrator_shader_mnee_sort_counter_.size() != 0)
{
queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
}

View File

@@ -847,6 +847,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
-DWITH_ONEAPI
-ffast-math
-O2
-D__KERNEL_LOCAL_ATOMIC_SORT__
-o"${cycles_kernel_oneapi_lib}"
-I"${CMAKE_CURRENT_SOURCE_DIR}/.."
${SYCL_CPP_FLAGS}

View File

@@ -432,6 +432,17 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
/* The oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index,
sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
@@ -439,9 +450,9 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
int num_states_limit,
ccl_global int *indices,
int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
@@ -449,6 +460,20 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
# ifdef __KERNEL_METAL__
int max_shaders = context.launch_params_metal.data.max_shaders;
# endif
# ifdef __KERNEL_ONEAPI__
/* The Metal backend doesn't have these particular ccl_gpu_* defines, so the
* shared kernel code uses metal_* names; define them here from the ccl_gpu_*
* equivalents so oneAPI stays compatible with these kernels. */
int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
int metal_local_id = ccl_gpu_thread_idx_x;
int metal_local_size = ccl_gpu_block_dim_x;
int metal_grid_id = ccl_gpu_block_idx_x;
ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
# endif
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
@@ -456,7 +481,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
(ccl_gpu_shared int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
@@ -464,6 +489,17 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
/* The oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index,
sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
@@ -471,9 +507,10 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
int num_states_limit,
ccl_global int *indices,
int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
@@ -481,6 +518,20 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
# ifdef __KERNEL_METAL__
int max_shaders = context.launch_params_metal.data.max_shaders;
# endif
# ifdef __KERNEL_ONEAPI__
/* The Metal backend doesn't have these particular ccl_gpu_* defines, so the
* shared kernel code uses metal_* names; define them here from the ccl_gpu_*
* equivalents so oneAPI stays compatible with these kernels. */
int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
int metal_local_id = ccl_gpu_thread_idx_x;
int metal_local_size = ccl_gpu_block_dim_x;
int metal_grid_id = ccl_gpu_block_idx_x;
ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
# endif
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
@@ -490,7 +541,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
(ccl_gpu_shared int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
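
Since the kernel body is shared with Metal, the oneAPI variant above receives its group-local memory as an extra sycl::local_accessor argument and aliases the metal_* names onto ccl_gpu_* builtins. A standalone SYCL sketch of the same pattern (illustrative, not Blender code; the function name is hypothetical):

#include <sycl/sycl.hpp>

void run_bucket_like_kernel(sycl::queue &q, int max_shaders) {
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<int> local_mem(max_shaders, cgh);
    cgh.parallel_for(sycl::nd_range<1>(1024, 128), [=](sycl::nd_item<1> it) {
      /* Equivalents of the metal_* compatibility shims above. */
      const int local_id = it.get_local_id(0);
      const int local_size = it.get_local_range(0);
      int *buckets = local_mem.get_pointer(); /* like threadgroup_array */
      for (int i = local_id; i < max_shaders; i += local_size) {
        buckets[i] = 0;
      }
      /* Work-group barrier, like ccl_gpu_local_syncthreads(). */
      sycl::group_barrier(it.get_group());
    });
  });
}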

View File

@@ -23,11 +23,6 @@ CCL_NAMESPACE_BEGIN
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
@@ -45,7 +40,13 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
atomic_store_local(&buckets[local_id], 0);
}
# ifdef __KERNEL_ONEAPI__
/* NOTE(@nsirgien): Only the local memory writes (buckets) matter here,
* so the faster local barrier can be used. */
ccl_gpu_local_syncthreads();
# else
ccl_gpu_syncthreads();
# endif
/* Determine bucket sizes within the partitions. */
@@ -58,11 +59,17 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
}
}
# ifdef __KERNEL_ONEAPI__
/* NOTE(@nsirgien): Only the local memory writes (buckets) matter here,
* so the faster local barrier can be used. */
ccl_gpu_local_syncthreads();
# else
ccl_gpu_syncthreads();
# endif
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
@@ -106,7 +113,13 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
# ifdef __KERNEL_ONEAPI__
/* NOTE(@nsirgien): Only the local memory writes (local_offset) matter here,
* so the faster local barrier can be used. */
ccl_gpu_local_syncthreads();
# else
ccl_gpu_syncthreads();
# endif
/* Write the sorted active indices. */
@@ -121,7 +134,7 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
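
Putting the two passes together: bucket sizes are counted in group-local memory with local atomics, a prefix sum turns the counts into offsets, and the write pass scatters state indices into their buckets. A compact single-work-group counting sort in the same spirit (standalone sketch, not Blender code; the function name is hypothetical and keys must lie in [0, num_keys)):

#include <sycl/sycl.hpp>
#include <vector>

std::vector<int> sort_indices_by_key(sycl::queue &q,
                                     const std::vector<int> &keys,
                                     int num_keys)
{
  const int n = (int)keys.size();
  std::vector<int> indices(n);
  {
    sycl::buffer<int> key_buf(keys.data(), sycl::range<1>(n));
    sycl::buffer<int> idx_buf(indices.data(), sycl::range<1>(n));
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor key(key_buf, cgh, sycl::read_only);
      sycl::accessor idx(idx_buf, cgh, sycl::write_only, sycl::no_init);
      sycl::local_accessor<int> offsets(num_keys, cgh);
      cgh.parallel_for(sycl::nd_range<1>(128, 128), [=](sycl::nd_item<1> it) {
        const int lid = it.get_local_id(0);
        const int lsize = it.get_local_range(0);
        /* Bucket pass: zero the buckets, then count keys with local atomics. */
        for (int i = lid; i < num_keys; i += lsize) {
          offsets[i] = 0;
        }
        sycl::group_barrier(it.get_group());
        for (int i = lid; i < n; i += lsize) {
          sycl::atomic_ref<int, sycl::memory_order::relaxed,
                           sycl::memory_scope::work_group,
                           sycl::access::address_space::local_space>
              bucket(offsets[key[i]]);
          bucket.fetch_add(1); /* like atomic_fetch_and_add_uint32_shared() */
        }
        sycl::group_barrier(it.get_group());
        /* Serial exclusive prefix sum; fine for a small number of keys. */
        if (lid == 0) {
          int sum = 0;
          for (int i = 0; i < num_keys; i++) {
            const int count = offsets[i];
            offsets[i] = sum;
            sum += count;
          }
        }
        sycl::group_barrier(it.get_group());
        /* Write pass: scatter every index into its bucket slot. */
        for (int i = lid; i < n; i += lsize) {
          sycl::atomic_ref<int, sycl::memory_order::relaxed,
                           sycl::memory_scope::work_group,
                           sycl::access::address_space::local_space>
              slot(offsets[key[i]]);
          idx[slot.fetch_add(1)] = i;
        }
      });
    });
  } /* buffer destruction copies the result back into `indices` */
  return indices;
}

For example, sort_indices_by_key(q, keys, 64) returns the indices grouped by key, which is what the sorted-index kernels consume; the real kernels additionally process many partitions in parallel rather than a single work-group.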

View File

@@ -48,6 +48,7 @@
#define ccl_loop_no_unroll
#define ccl_optional_struct_init
#define ccl_private
#define ccl_gpu_shared
#define ATTR_FALLTHROUGH __attribute__((fallthrough))
#define ccl_constant const
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))

View File

@@ -367,6 +367,14 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
# pragma GCC diagnostic error "-Wswitch"
# endif
int max_shaders = 0;
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS)
{
max_shaders = kernel_context->scene_max_shaders;
}
try {
queue->submit([&](sycl::handler &cgh) {
# ifdef WITH_EMBREE_GPU
@@ -509,13 +517,31 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
sycl::local_accessor<int> local_mem(max_shaders, cgh);
oneapi_kernel_integrator_sort_bucket_pass(kg,
global_size,
local_size,
cgh,
*(int *)(args[0]),
*(int *)(args[1]),
*(int *)(args[2]),
*(int **)(args[3]),
*(int *)(args[4]),
local_mem);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
sycl::local_accessor<int> local_mem(max_shaders, cgh);
oneapi_kernel_integrator_sort_write_pass(kg,
global_size,
local_size,
cgh,
*(int *)(args[0]),
*(int *)(args[1]),
*(int *)(args[2]),
*(int **)(args[3]),
*(int *)(args[4]),
local_mem);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
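
The generic oneapi_call() path cannot be used for these two kernels because the sycl::local_accessor has to be constructed with the sycl::handler inside the submit lambda. A stripped-down sketch of that enqueue shape (hypothetical names, not Blender code):

#include <sycl/sycl.hpp>

void enqueue_sort_pass(sycl::queue &q, void **args, int max_shaders) {
  /* Arguments arrive as an array of pointers and are read back at their
   * expected types, mirroring the args[0..4] unpacking above. */
  const int num_states = *(int *)(args[0]);
  int *indices = *(int **)(args[3]);
  q.submit([&](sycl::handler &cgh) {
    /* Must be created here: the accessor binds local memory to this launch. */
    sycl::local_accessor<int> local_mem(max_shaders, cgh);
    cgh.parallel_for(sycl::nd_range<1>(1024, 128), [=](sycl::nd_item<1> it) {
      /* Kernel body elided; the real kernels use num_states, indices and
       * local_mem. Touch local memory so the accessor is captured. */
      if (it.get_local_id(0) == 0) {
        local_mem[0] = num_states;
      }
      (void)indices;
    });
  });
}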

View File

@@ -32,6 +32,8 @@ struct KernelContext {
SyclQueue *queue;
/* Pointer to USM device memory with all global/constant allocations on this device. */
void *kernel_globals;
/* We need this additional data for some kernels. */
int scene_max_shaders;
};
/* Use extern C linking so that the symbols can be easily loaded from the dynamic library at runtime.

View File

@@ -21,6 +21,10 @@
#else /* __KERNEL_GPU__ */
# ifndef __KERNEL_ONEAPI__
# define atomic_fetch_and_add_uint32_shared atomic_fetch_and_add_uint32
# endif
# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
@@ -140,6 +144,11 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
# define atomic_store_local(p, x) \
atomic_store_explicit((ccl_gpu_shared atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((ccl_gpu_shared atomic_int *)p, memory_order_relaxed)
# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
# define ccl_barrier(flags) threadgroup_barrier(flags)
@@ -191,6 +200,16 @@ ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
return atomic.fetch_add(x);
}
ccl_device_inline int atomic_fetch_and_add_uint32_shared(int *p, int x)
{
sycl::atomic_ref<int,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::local_space>
atomic(*p);
return atomic.fetch_add(x);
}
ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
unsigned int x)
{
@@ -253,6 +272,26 @@ ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
return atomic.fetch_or(x);
}
ccl_device_inline void atomic_store_local(int *p, int x)
{
sycl::atomic_ref<int,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::local_space>
atomic(*p);
atomic.store(x);
}
ccl_device_inline int atomic_load_local(int *p)
{
sycl::atomic_ref<int,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::local_space>
atomic(*p);
return atomic.load();
}
# endif /* __KERNEL_ONEAPI__ */
#endif /* __KERNEL_GPU__ */
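
A minimal usage sketch for the local-memory atomic helpers added above (standalone and illustrative, not Blender code; the function name is hypothetical): one thread writes a flag into group-local memory with a relaxed atomic store and every thread reads it back after a barrier. Note the merged helpers use sycl::memory_scope::device; since local memory is only visible within a work-group, work_group scope is sufficient and is what this sketch uses.

#include <sycl/sycl.hpp>

void local_atomic_example(sycl::queue &q) {
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<int> flag(1, cgh);
    cgh.parallel_for(sycl::nd_range<1>(128, 128), [=](sycl::nd_item<1> it) {
      /* Same atomic_ref shape as atomic_store_local()/atomic_load_local(). */
      sycl::atomic_ref<int,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::work_group,
                       sycl::access::address_space::local_space>
          f(flag[0]);
      if (it.get_local_id(0) == 0) {
        f.store(42);
      }
      sycl::group_barrier(it.get_group());
      const int v = f.load(); /* every thread in the group reads 42 */
      (void)v;
    });
  });
}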