Cycles: oneAPI: use local memory for faster shader sorting #107994
|
@ -262,6 +262,11 @@ string OneapiDevice::oneapi_error_message()
|
|||
return string(oneapi_error_string_);
|
||||
}
|
||||
|
||||
int OneapiDevice::scene_max_shaders()
|
||||
{
|
||||
return scene_max_shaders_;
|
||||
}
|
||||
|
||||
void *OneapiDevice::kernel_globals_device_pointer()
|
||||
{
|
||||
return kg_memory_device_;
|
||||
|
@ -436,6 +441,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
|
|||
/* Update scene handle(since it is different for each device on multi devices) */
|
||||
KernelData *const data = (KernelData *)host;
|
||||
data->device_bvh = embree_scene;
|
||||
|
||||
/* We need this number later for proper local memory allocation. */
|
||||
scene_max_shaders_ = data->max_shaders;
|
||||
}
|
||||
# endif
|
||||
|
||||
|
|
|
@ -37,6 +37,7 @@ class OneapiDevice : public Device {
|
|||
std::string oneapi_error_string_;
|
||||
bool use_hardware_raytracing = false;
|
||||
unsigned int kernel_features = 0;
|
||||
int scene_max_shaders_ = 0;
|
||||
|
||||
public:
|
||||
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
|
||||
|
@ -61,6 +62,8 @@ class OneapiDevice : public Device {
|
|||
|
||||
string oneapi_error_message();
|
||||
|
||||
int scene_max_shaders();
|
||||
|
||||
void *kernel_globals_device_pointer();
|
||||
|
||||
void mem_alloc(device_memory &mem) override;
|
||||
|
|
|
@ -59,7 +59,7 @@ void OneapiDeviceQueue::init_execution()
|
|||
void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
|
||||
assert(device_queue);
|
||||
assert(kg_dptr);
|
||||
kernel_context_ = new KernelContext{device_queue, kg_dptr};
|
||||
kernel_context_ = new KernelContext{device_queue, kg_dptr, 0};
|
||||
|
||||
debug_init_execution();
|
||||
}
|
||||
|
@ -78,12 +78,13 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
|
|||
assert(signed_kernel_work_size >= 0);
|
||||
size_t kernel_work_size = (size_t)signed_kernel_work_size;
|
||||
|
||||
assert(kernel_context_);
|
||||
kernel_context_->scene_max_shaders = oneapi_device_->scene_max_shaders();
|
||||
|
||||
size_t kernel_local_size = oneapi_kernel_preferred_local_size(
|
||||
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
|
||||
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
|
||||
|
||||
assert(kernel_context_);
|
||||
|
||||
/* Call the oneAPI kernel DLL to launch the requested kernel. */
|
||||
bool is_finished_ok = oneapi_device_->enqueue_kernel(
|
||||
kernel_context_, kernel, uniformed_kernel_work_size, args);
|
||||
|
|
|
@ -39,6 +39,11 @@ class OneapiDeviceQueue : public DeviceQueue {
|
|||
virtual void copy_to_device(device_memory &mem) override;
|
||||
virtual void copy_from_device(device_memory &mem) override;
|
||||
|
||||
virtual bool supports_local_atomic_sort() const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
OneapiDevice *oneapi_device_;
|
||||
KernelContext *kernel_context_;
|
||||
|
|
|
@ -385,11 +385,17 @@ void PathTraceWorkGPU::enqueue_reset()
|
|||
|
||||
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
|
||||
queue_->zero_to_device(integrator_queue_counter_);
|
||||
queue_->zero_to_device(integrator_shader_sort_counter_);
|
||||
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
|
||||
if (integrator_shader_sort_counter_.size() != 0) {
|
||||
queue_->zero_to_device(integrator_shader_sort_counter_);
|
||||
}
|
||||
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE &&
|
||||
integrator_shader_raytrace_sort_counter_.size() != 0)
|
||||
{
|
||||
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
|
||||
}
|
||||
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
|
||||
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE &&
|
||||
integrator_shader_mnee_sort_counter_.size() != 0)
|
||||
{
|
||||
queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
|
||||
}
|
||||
|
||||
|
|
|
@ -847,6 +847,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
|
|||
-DWITH_ONEAPI
|
||||
-ffast-math
|
||||
-O2
|
||||
-D__KERNEL_LOCAL_ATOMIC_SORT__
|
||||
-o"${cycles_kernel_oneapi_lib}"
|
||||
-I"${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||
${SYCL_CPP_FLAGS}
|
||||
|
|
|
@ -432,6 +432,17 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
|
|||
}
|
||||
ccl_gpu_kernel_postfix
|
||||
|
||||
/* oneAPI verion needs the local_mem accessor in the arguments. */
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
||||
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
|
||||
int num_states,
|
||||
int partition_size,
|
||||
int num_states_limit,
|
||||
ccl_global int *indices,
|
||||
int kernel_index,
|
||||
sycl::local_accessor<int> &local_mem)
|
||||
#else
|
||||
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
||||
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
|
||||
int num_states,
|
||||
|
@ -439,9 +450,9 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
int num_states_limit,
|
||||
ccl_global int *indices,
|
||||
int kernel_index)
|
||||
#endif
|
||||
{
|
||||
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
|
||||
int max_shaders = context.launch_params_metal.data.max_shaders;
|
||||
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
|
||||
kernel_integrator_state.path.queued_kernel;
|
||||
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
|
||||
|
@ -449,6 +460,20 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
ccl_global int *key_offsets = (ccl_global int *)
|
||||
kernel_integrator_state.sort_partition_key_offsets;
|
||||
|
||||
# ifdef __KERNEL_METAL__
|
||||
int max_shaders = context.launch_params_metal.data.max_shaders;
|
||||
# endif
|
||||
|
||||
# ifdef __KERNEL_ONEAPI__
|
||||
/* Metal backend doesn't have these particular ccl_gpu_* defines and current kernel code
|
||||
* uses metal_*, we need the below to be compatible with these kernels. */
|
||||
int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
|
||||
int metal_local_id = ccl_gpu_thread_idx_x;
|
||||
int metal_local_size = ccl_gpu_block_dim_x;
|
||||
int metal_grid_id = ccl_gpu_block_idx_x;
|
||||
ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
|
||||
# endif
|
||||
|
||||
gpu_parallel_sort_bucket_pass(num_states,
|
||||
partition_size,
|
||||
max_shaders,
|
||||
|
@ -456,7 +481,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
d_queued_kernel,
|
||||
d_shader_sort_key,
|
||||
key_offsets,
|
||||
(threadgroup int *)threadgroup_array,
|
||||
(ccl_gpu_shared int *)threadgroup_array,
|
||||
metal_local_id,
|
||||
metal_local_size,
|
||||
metal_grid_id);
|
||||
|
@ -464,6 +489,17 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
}
|
||||
ccl_gpu_kernel_postfix
|
||||
|
||||
/* oneAPI verion needs the local_mem accessor in the arguments. */
|
||||
#ifdef __KERNEL_ONEAPI__
|
||||
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
||||
ccl_gpu_kernel_signature(integrator_sort_write_pass,
|
||||
int num_states,
|
||||
int partition_size,
|
||||
int num_states_limit,
|
||||
ccl_global int *indices,
|
||||
int kernel_index,
|
||||
sycl::local_accessor<int> &local_mem)
|
||||
#else
|
||||
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
||||
ccl_gpu_kernel_signature(integrator_sort_write_pass,
|
||||
int num_states,
|
||||
|
@ -471,9 +507,10 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
int num_states_limit,
|
||||
ccl_global int *indices,
|
||||
int kernel_index)
|
||||
#endif
|
||||
|
||||
{
|
||||
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
|
||||
int max_shaders = context.launch_params_metal.data.max_shaders;
|
||||
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
|
||||
kernel_integrator_state.path.queued_kernel;
|
||||
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
|
||||
|
@ -481,6 +518,20 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
ccl_global int *key_offsets = (ccl_global int *)
|
||||
kernel_integrator_state.sort_partition_key_offsets;
|
||||
|
||||
# ifdef __KERNEL_METAL__
|
||||
int max_shaders = context.launch_params_metal.data.max_shaders;
|
||||
# endif
|
||||
|
||||
# ifdef __KERNEL_ONEAPI__
|
||||
/* Metal backend doesn't have these particular ccl_gpu_* defines and current kernel code
|
||||
* uses metal_*, we need the below to be compatible with these kernels. */
|
||||
int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
|
||||
int metal_local_id = ccl_gpu_thread_idx_x;
|
||||
int metal_local_size = ccl_gpu_block_dim_x;
|
||||
int metal_grid_id = ccl_gpu_block_idx_x;
|
||||
ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
|
||||
# endif
|
||||
|
||||
gpu_parallel_sort_write_pass(num_states,
|
||||
partition_size,
|
||||
max_shaders,
|
||||
|
@ -490,7 +541,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
|
|||
d_queued_kernel,
|
||||
d_shader_sort_key,
|
||||
key_offsets,
|
||||
(threadgroup int *)threadgroup_array,
|
||||
(ccl_gpu_shared int *)threadgroup_array,
|
||||
metal_local_id,
|
||||
metal_local_size,
|
||||
metal_grid_id);
|
||||
|
|
|
@ -23,11 +23,6 @@ CCL_NAMESPACE_BEGIN
|
|||
|
||||
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
|
||||
|
||||
# define atomic_store_local(p, x) \
|
||||
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
|
||||
# define atomic_load_local(p) \
|
||||
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
|
||||
|
||||
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
|
||||
const uint partition_size,
|
||||
const uint max_shaders,
|
||||
|
@ -45,7 +40,13 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
|
|||
atomic_store_local(&buckets[local_id], 0);
|
||||
}
|
||||
|
||||
# ifdef __KERNEL_ONEAPI__
|
||||
/* NOTE(@nsirgien): For us here only local memory writing (buckets) is important,
|
||||
* so faster local barriers can be used. */
|
||||
ccl_gpu_local_syncthreads();
|
||||
# else
|
||||
ccl_gpu_syncthreads();
|
||||
# endif
|
||||
|
||||
/* Determine bucket sizes within the partitions. */
|
||||
|
||||
|
@ -58,11 +59,17 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
|
|||
ushort kernel_index = d_queued_kernel[state_index];
|
||||
if (kernel_index == queued_kernel) {
|
||||
uint key = d_shader_sort_key[state_index] % max_shaders;
|
||||
atomic_fetch_and_add_uint32(&buckets[key], 1);
|
||||
atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
|
||||
}
|
||||
}
|
||||
|
||||
# ifdef __KERNEL_ONEAPI__
|
||||
/* NOTE(@nsirgien): For us here only local memory writing (buckets) is important,
|
||||
* so faster local barriers can be used. */
|
||||
ccl_gpu_local_syncthreads();
|
||||
# else
|
||||
ccl_gpu_syncthreads();
|
||||
# endif
|
||||
|
||||
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
|
||||
|
||||
|
@ -106,7 +113,13 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
|
|||
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
|
||||
}
|
||||
|
||||
# ifdef __KERNEL_ONEAPI__
|
||||
/* NOTE(@nsirgien): For us here only local memory writing (local_offset) is important,
|
||||
* so faster local barriers can be used. */
|
||||
ccl_gpu_local_syncthreads();
|
||||
# else
|
||||
ccl_gpu_syncthreads();
|
||||
# endif
|
||||
|
||||
/* Write the sorted active indices. */
|
||||
|
||||
|
@ -121,7 +134,7 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
|
|||
ushort kernel_index = d_queued_kernel[state_index];
|
||||
if (kernel_index == queued_kernel) {
|
||||
uint key = d_shader_sort_key[state_index] % max_shaders;
|
||||
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
|
||||
int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
|
||||
if (index < num_states_limit) {
|
||||
indices[index] = state_index;
|
||||
}
|
||||
|
|
|
@ -48,6 +48,7 @@
|
|||
#define ccl_loop_no_unroll
|
||||
#define ccl_optional_struct_init
|
||||
#define ccl_private
|
||||
#define ccl_gpu_shared
|
||||
#define ATTR_FALLTHROUGH __attribute__((fallthrough))
|
||||
#define ccl_constant const
|
||||
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))
|
||||
|
|
|
@ -367,6 +367,14 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
|||
# pragma GCC diagnostic error "-Wswitch"
|
||||
# endif
|
||||
|
||||
int max_shaders = 0;
|
||||
|
||||
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS ||
|
||||
device_kernel == DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS)
|
||||
{
|
||||
max_shaders = (kernel_context->scene_max_shaders);
|
||||
}
|
||||
|
||||
try {
|
||||
queue->submit([&](sycl::handler &cgh) {
|
||||
# ifdef WITH_EMBREE_GPU
|
||||
|
@ -509,13 +517,31 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
|
|||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
|
||||
sycl::local_accessor<int> local_mem(max_shaders, cgh);
|
||||
oneapi_kernel_integrator_sort_bucket_pass(kg,
|
||||
global_size,
|
||||
local_size,
|
||||
cgh,
|
||||
*(int *)(args[0]),
|
||||
*(int *)(args[1]),
|
||||
*(int *)(args[2]),
|
||||
*(int **)(args[3]),
|
||||
*(int *)(args[4]),
|
||||
local_mem);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
|
||||
oneapi_call(
|
||||
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
|
||||
sycl::local_accessor<int> local_mem(max_shaders, cgh);
|
||||
oneapi_kernel_integrator_sort_write_pass(kg,
|
||||
global_size,
|
||||
local_size,
|
||||
cgh,
|
||||
*(int *)(args[0]),
|
||||
*(int *)(args[1]),
|
||||
*(int *)(args[2]),
|
||||
*(int **)(args[3]),
|
||||
*(int *)(args[4]),
|
||||
local_mem);
|
||||
break;
|
||||
}
|
||||
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
|
||||
|
|
|
@ -32,6 +32,8 @@ struct KernelContext {
|
|||
SyclQueue *queue;
|
||||
/* Pointer to USM device memory with all global/constant allocation on this device */
|
||||
void *kernel_globals;
|
||||
/* We needs this additional data for some kernels. */
|
||||
int scene_max_shaders;
|
||||
};
|
||||
|
||||
/* Use extern C linking so that the symbols can be easily load from the dynamic library at runtime.
|
||||
|
|
|
@ -21,6 +21,10 @@
|
|||
|
||||
#else /* __KERNEL_GPU__ */
|
||||
|
||||
# ifndef __KERNEL_ONEAPI__
|
||||
# define atomic_fetch_and_add_uint32_shared atomic_fetch_and_add_uint32
|
||||
# endif
|
||||
|
||||
# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
|
||||
|
||||
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
|
||||
|
@ -140,6 +144,11 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
|
|||
# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
|
||||
# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
|
||||
|
||||
# define atomic_store_local(p, x) \
|
||||
atomic_store_explicit((ccl_gpu_shared atomic_int *)p, x, memory_order_relaxed)
|
||||
# define atomic_load_local(p) \
|
||||
atomic_load_explicit((ccl_gpu_shared atomic_int *)p, memory_order_relaxed)
|
||||
|
||||
# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
|
||||
# define ccl_barrier(flags) threadgroup_barrier(flags)
|
||||
|
||||
|
@ -191,6 +200,16 @@ ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
|
|||
return atomic.fetch_add(x);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_fetch_and_add_uint32_shared(int *p, int x)
|
||||
{
|
||||
sycl::atomic_ref<int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::local_space>
|
||||
atomic(*p);
|
||||
return atomic.fetch_add(x);
|
||||
}
|
||||
|
||||
ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
|
||||
unsigned int x)
|
||||
{
|
||||
|
@ -253,6 +272,26 @@ ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
|
|||
return atomic.fetch_or(x);
|
||||
}
|
||||
|
||||
ccl_device_inline void atomic_store_local(int *p, int x)
|
||||
{
|
||||
sycl::atomic_ref<int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::local_space>
|
||||
atomic(*p);
|
||||
atomic.store(x);
|
||||
}
|
||||
|
||||
ccl_device_inline int atomic_load_local(int *p)
|
||||
{
|
||||
sycl::atomic_ref<int,
|
||||
sycl::memory_order::relaxed,
|
||||
sycl::memory_scope::device,
|
||||
sycl::access::address_space::local_space>
|
||||
atomic(*p);
|
||||
return atomic.load();
|
||||
}
|
||||
|
||||
# endif /* __KERNEL_ONEAPI__ */
|
||||
|
||||
#endif /* __KERNEL_GPU__ */
|
||||
|
|
Loading…
Reference in New Issue