Cycles: oneAPI: use local memory for faster shader sorting #107994

Merged
Xavier Hallade merged 2 commits from Sirgienko/blender:oneapi_local_memory_sorting into main 2023-05-17 11:08:10 +02:00
12 changed files with 177 additions and 21 deletions

View File

@@ -262,6 +262,11 @@ string OneapiDevice::oneapi_error_message()
return string(oneapi_error_string_);
}
int OneapiDevice::scene_max_shaders()
{
return scene_max_shaders_;
}
void *OneapiDevice::kernel_globals_device_pointer()
{
return kg_memory_device_;
@@ -436,6 +441,9 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
/* Update scene handle (it is different for each device in a multi-device setup). */
KernelData *const data = (KernelData *)host;
data->device_bvh = embree_scene;
/* We need this number later for proper local memory allocation. */
scene_max_shaders_ = data->max_shaders;
}
# endif
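
The value cached here sizes the group-local memory that the sort kernels request at enqueue time. A minimal standalone SYCL sketch of that pattern (illustrative only, not Blender code; the function name is hypothetical):

#include <sycl/sycl.hpp>

/* A host-side value captured earlier (like scene_max_shaders_ above) sizes a
 * local_accessor when the kernel is enqueued. */
void enqueue_with_local_mem(sycl::queue &q, int max_shaders) {
  q.submit([&](sycl::handler &cgh) {
    /* One int of group-local memory per shader, allocated at enqueue time. */
    sycl::local_accessor<int> local_mem(max_shaders, cgh);
    cgh.parallel_for(sycl::nd_range<1>(1024, 64), [=](sycl::nd_item<1> it) {
      const int lid = it.get_local_id(0);
      if (lid < max_shaders) {
        local_mem[lid] = 0; /* e.g. clear per-shader bucket counters */
      }
    });
  });
}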

View File

@@ -37,6 +37,7 @@ class OneapiDevice : public Device {
std::string oneapi_error_string_;
bool use_hardware_raytracing = false;
unsigned int kernel_features = 0;
int scene_max_shaders_ = 0;
public:
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
@@ -61,6 +62,8 @@ class OneapiDevice : public Device {
string oneapi_error_message();
int scene_max_shaders();
void *kernel_globals_device_pointer();
void mem_alloc(device_memory &mem) override;

View File

@@ -59,7 +59,7 @@ void OneapiDeviceQueue::init_execution()
void *kg_dptr = (void *)oneapi_device_->kernel_globals_device_pointer();
assert(device_queue);
assert(kg_dptr);
kernel_context_ = new KernelContext{device_queue, kg_dptr};
kernel_context_ = new KernelContext{device_queue, kg_dptr, 0};
debug_init_execution();
}
@@ -78,12 +78,13 @@ bool OneapiDeviceQueue::enqueue(DeviceKernel kernel,
assert(signed_kernel_work_size >= 0);
size_t kernel_work_size = (size_t)signed_kernel_work_size;
assert(kernel_context_);
kernel_context_->scene_max_shaders = oneapi_device_->scene_max_shaders();
size_t kernel_local_size = oneapi_kernel_preferred_local_size(
kernel_context_->queue, (::DeviceKernel)kernel, kernel_work_size);
size_t uniformed_kernel_work_size = round_up(kernel_work_size, kernel_local_size);
assert(kernel_context_);
/* Call the oneAPI kernel DLL to launch the requested kernel. */
bool is_finished_ok = oneapi_device_->enqueue_kernel(
kernel_context_, kernel, uniformed_kernel_work_size, args);

View File

@@ -39,6 +39,11 @@ class OneapiDeviceQueue : public DeviceQueue {
virtual void copy_to_device(device_memory &mem) override;
virtual void copy_from_device(device_memory &mem) override;
virtual bool supports_local_atomic_sort() const
{
return true;
}
protected:
OneapiDevice *oneapi_device_;
KernelContext *kernel_context_;

View File

@@ -385,11 +385,17 @@ void PathTraceWorkGPU::enqueue_reset()
queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
queue_->zero_to_device(integrator_queue_counter_);
queue_->zero_to_device(integrator_shader_sort_counter_);
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
if (integrator_shader_sort_counter_.size() != 0) {
queue_->zero_to_device(integrator_shader_sort_counter_);
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_NODE_RAYTRACE &&
integrator_shader_raytrace_sort_counter_.size() != 0)
{
queue_->zero_to_device(integrator_shader_raytrace_sort_counter_);
}
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE) {
if (device_scene_->data.kernel_features & KERNEL_FEATURE_MNEE &&
integrator_shader_mnee_sort_counter_.size() != 0)
{
queue_->zero_to_device(integrator_shader_mnee_sort_counter_);
}

View File

@@ -847,6 +847,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
-DWITH_ONEAPI
-ffast-math
-O2
-D__KERNEL_LOCAL_ATOMIC_SORT__
-o"${cycles_kernel_oneapi_lib}"
-I"${CMAKE_CURRENT_SOURCE_DIR}/.."
${SYCL_CPP_FLAGS}

View File

@@ -432,6 +432,17 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
/* The oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index,
sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_bucket_pass,
int num_states,
@@ -439,9 +450,9 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
int num_states_limit,
ccl_global int *indices,
int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
@@ -449,6 +460,20 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
# ifdef __KERNEL_METAL__
int max_shaders = context.launch_params_metal.data.max_shaders;
# endif
# ifdef __KERNEL_ONEAPI__
/* The Metal backend doesn't have these particular ccl_gpu_* defines, so the
* shared kernel code uses metal_* names; define them here from the ccl_gpu_*
* equivalents so oneAPI stays compatible with these kernels. */
int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
int metal_local_id = ccl_gpu_thread_idx_x;
int metal_local_size = ccl_gpu_block_dim_x;
int metal_grid_id = ccl_gpu_block_idx_x;
ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
# endif
gpu_parallel_sort_bucket_pass(num_states,
partition_size,
max_shaders,
@@ -456,7 +481,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
(ccl_gpu_shared int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
@@ -464,6 +489,17 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
}
ccl_gpu_kernel_postfix
/* The oneAPI version needs the local_mem accessor in the arguments. */
#ifdef __KERNEL_ONEAPI__
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
int partition_size,
int num_states_limit,
ccl_global int *indices,
int kernel_index,
sycl::local_accessor<int> &local_mem)
#else
ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_gpu_kernel_signature(integrator_sort_write_pass,
int num_states,
@@ -471,9 +507,10 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
int num_states_limit,
ccl_global int *indices,
int kernel_index)
#endif
{
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
int max_shaders = context.launch_params_metal.data.max_shaders;
ccl_global ushort *d_queued_kernel = (ccl_global ushort *)
kernel_integrator_state.path.queued_kernel;
ccl_global uint *d_shader_sort_key = (ccl_global uint *)
@@ -481,6 +518,20 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
ccl_global int *key_offsets = (ccl_global int *)
kernel_integrator_state.sort_partition_key_offsets;
# ifdef __KERNEL_METAL__
int max_shaders = context.launch_params_metal.data.max_shaders;
# endif
# ifdef __KERNEL_ONEAPI__
/* The Metal backend doesn't have these particular ccl_gpu_* defines, so the
* shared kernel code uses metal_* names; define them here from the ccl_gpu_*
* equivalents so oneAPI stays compatible with these kernels. */
int max_shaders = ((ONEAPIKernelContext *)kg)->__data->max_shaders;
int metal_local_id = ccl_gpu_thread_idx_x;
int metal_local_size = ccl_gpu_block_dim_x;
int metal_grid_id = ccl_gpu_block_idx_x;
ccl_gpu_shared int *threadgroup_array = local_mem.get_pointer();
# endif
gpu_parallel_sort_write_pass(num_states,
partition_size,
max_shaders,
@@ -490,7 +541,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_SORT_BLOCK_SIZE)
d_queued_kernel,
d_shader_sort_key,
key_offsets,
(threadgroup int *)threadgroup_array,
(ccl_gpu_shared int *)threadgroup_array,
metal_local_id,
metal_local_size,
metal_grid_id);
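
Since the kernel body is shared with Metal, the oneAPI variant above receives its group-local memory as an extra sycl::local_accessor argument and aliases the metal_* names onto ccl_gpu_* builtins. A standalone SYCL sketch of the same pattern (illustrative, not Blender code; the function name is hypothetical):

#include <sycl/sycl.hpp>

void run_bucket_like_kernel(sycl::queue &q, int max_shaders) {
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<int> local_mem(max_shaders, cgh);
    cgh.parallel_for(sycl::nd_range<1>(1024, 128), [=](sycl::nd_item<1> it) {
      /* Equivalents of the metal_* compatibility shims above. */
      const int local_id = it.get_local_id(0);
      const int local_size = it.get_local_range(0);
      int *buckets = local_mem.get_pointer(); /* like threadgroup_array */
      for (int i = local_id; i < max_shaders; i += local_size) {
        buckets[i] = 0;
      }
      /* Work-group barrier, like ccl_gpu_local_syncthreads(). */
      sycl::group_barrier(it.get_group());
    });
  });
}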

View File

@@ -23,11 +23,6 @@ CCL_NAMESPACE_BEGIN
#if defined(__KERNEL_LOCAL_ATOMIC_SORT__)
# define atomic_store_local(p, x) \
atomic_store_explicit((threadgroup atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((threadgroup atomic_int *)p, memory_order_relaxed)
ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
const uint partition_size,
const uint max_shaders,
@@ -45,7 +40,13 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
atomic_store_local(&buckets[local_id], 0);
}
# ifdef __KERNEL_ONEAPI__
/* NOTE(@nsirgien): Only the local memory writes (buckets) matter here,
* so the faster local barrier can be used. */
ccl_gpu_local_syncthreads();
# else
ccl_gpu_syncthreads();
# endif
/* Determine bucket sizes within the partitions. */
@@ -58,11 +59,17 @@ ccl_device_inline void gpu_parallel_sort_bucket_pass(const uint num_states,
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
atomic_fetch_and_add_uint32(&buckets[key], 1);
atomic_fetch_and_add_uint32_shared(&buckets[key], 1);
}
}
# ifdef __KERNEL_ONEAPI__
/* NOTE(@nsirgien): Only the local memory writes (buckets) matter here,
* so the faster local barrier can be used. */
ccl_gpu_local_syncthreads();
# else
ccl_gpu_syncthreads();
# endif
/* Calculate the partition's local offsets from the prefix sum of bucket sizes. */
@@ -106,7 +113,13 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
atomic_store_local(&local_offset[local_id], key_offsets[local_id] + partition_offset);
}
# ifdef __KERNEL_ONEAPI__
/* NOTE(@nsirgien): Only the local memory writes (local_offset) matter here,
* so the faster local barrier can be used. */
ccl_gpu_local_syncthreads();
# else
ccl_gpu_syncthreads();
# endif
/* Write the sorted active indices. */
@@ -121,7 +134,7 @@ ccl_device_inline void gpu_parallel_sort_write_pass(const uint num_states,
ushort kernel_index = d_queued_kernel[state_index];
if (kernel_index == queued_kernel) {
uint key = d_shader_sort_key[state_index] % max_shaders;
int index = atomic_fetch_and_add_uint32(&local_offset[key], 1);
int index = atomic_fetch_and_add_uint32_shared(&local_offset[key], 1);
if (index < num_states_limit) {
indices[index] = state_index;
}
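
Putting the two passes together: bucket sizes are counted in group-local memory with local atomics, a prefix sum turns the counts into offsets, and the write pass scatters state indices into their buckets. A compact single-work-group counting sort in the same spirit (standalone sketch, not Blender code; the function name is hypothetical and keys must lie in [0, num_keys)):

#include <sycl/sycl.hpp>
#include <vector>

std::vector<int> sort_indices_by_key(sycl::queue &q,
                                     const std::vector<int> &keys,
                                     int num_keys)
{
  const int n = (int)keys.size();
  std::vector<int> indices(n);
  {
    sycl::buffer<int> key_buf(keys.data(), sycl::range<1>(n));
    sycl::buffer<int> idx_buf(indices.data(), sycl::range<1>(n));
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor key(key_buf, cgh, sycl::read_only);
      sycl::accessor idx(idx_buf, cgh, sycl::write_only, sycl::no_init);
      sycl::local_accessor<int> offsets(num_keys, cgh);
      cgh.parallel_for(sycl::nd_range<1>(128, 128), [=](sycl::nd_item<1> it) {
        const int lid = it.get_local_id(0);
        const int lsize = it.get_local_range(0);
        /* Bucket pass: zero the buckets, then count keys with local atomics. */
        for (int i = lid; i < num_keys; i += lsize) {
          offsets[i] = 0;
        }
        sycl::group_barrier(it.get_group());
        for (int i = lid; i < n; i += lsize) {
          sycl::atomic_ref<int, sycl::memory_order::relaxed,
                           sycl::memory_scope::work_group,
                           sycl::access::address_space::local_space>
              bucket(offsets[key[i]]);
          bucket.fetch_add(1); /* like atomic_fetch_and_add_uint32_shared() */
        }
        sycl::group_barrier(it.get_group());
        /* Serial exclusive prefix sum; fine for a small number of keys. */
        if (lid == 0) {
          int sum = 0;
          for (int i = 0; i < num_keys; i++) {
            const int count = offsets[i];
            offsets[i] = sum;
            sum += count;
          }
        }
        sycl::group_barrier(it.get_group());
        /* Write pass: scatter every index into its bucket slot. */
        for (int i = lid; i < n; i += lsize) {
          sycl::atomic_ref<int, sycl::memory_order::relaxed,
                           sycl::memory_scope::work_group,
                           sycl::access::address_space::local_space>
              slot(offsets[key[i]]);
          idx[slot.fetch_add(1)] = i;
        }
      });
    });
  } /* buffer destruction copies the result back into `indices` */
  return indices;
}

For example, sort_indices_by_key(q, keys, 64) returns the indices grouped by key, which is what the sorted-index kernels consume; the real kernels additionally process many partitions in parallel rather than a single work-group.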

View File

@@ -48,6 +48,7 @@
#define ccl_loop_no_unroll
#define ccl_optional_struct_init
#define ccl_private
#define ccl_gpu_shared
#define ATTR_FALLTHROUGH __attribute__((fallthrough))
#define ccl_constant const
#define ccl_try_align(...) __attribute__((aligned(__VA_ARGS__)))

View File

@@ -367,6 +367,14 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
# pragma GCC diagnostic error "-Wswitch"
# endif
int max_shaders = 0;
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS)
{
max_shaders = kernel_context->scene_max_shaders;
}
try {
queue->submit([&](sycl::handler &cgh) {
# ifdef WITH_EMBREE_GPU
@@ -509,13 +517,31 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_bucket_pass);
sycl::local_accessor<int> local_mem(max_shaders, cgh);
oneapi_kernel_integrator_sort_bucket_pass(kg,
global_size,
local_size,
cgh,
*(int *)(args[0]),
*(int *)(args[1]),
*(int *)(args[2]),
*(int **)(args[3]),
*(int *)(args[4]),
local_mem);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sort_write_pass);
sycl::local_accessor<int> local_mem(max_shaders, cgh);
oneapi_kernel_integrator_sort_write_pass(kg,
global_size,
local_size,
cgh,
*(int *)(args[0]),
*(int *)(args[1]),
*(int *)(args[2]),
*(int **)(args[3]),
*(int *)(args[4]),
local_mem);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
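
The generic oneapi_call() path cannot be used for these two kernels because the sycl::local_accessor has to be constructed with the sycl::handler inside the submit lambda. A stripped-down sketch of that enqueue shape (hypothetical names, not Blender code):

#include <sycl/sycl.hpp>

void enqueue_sort_pass(sycl::queue &q, void **args, int max_shaders) {
  /* Arguments arrive as an array of pointers and are read back at their
   * expected types, mirroring the args[0..4] unpacking above. */
  const int num_states = *(int *)(args[0]);
  int *indices = *(int **)(args[3]);
  q.submit([&](sycl::handler &cgh) {
    /* Must be created here: the accessor binds local memory to this launch. */
    sycl::local_accessor<int> local_mem(max_shaders, cgh);
    cgh.parallel_for(sycl::nd_range<1>(1024, 128), [=](sycl::nd_item<1> it) {
      /* Kernel body elided; the real kernels use num_states, indices and
       * local_mem. Touch local memory so the accessor is captured. */
      if (it.get_local_id(0) == 0) {
        local_mem[0] = num_states;
      }
      (void)indices;
    });
  });
}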

View File

@@ -32,6 +32,8 @@ struct KernelContext {
SyclQueue *queue;
/* Pointer to USM device memory with all global/constant allocations on this device. */
void *kernel_globals;
/* We need this additional data for some kernels. */
int scene_max_shaders;
};
/* Use extern C linking so that the symbols can be easily loaded from the dynamic library at runtime.

View File

@@ -21,6 +21,10 @@
#else /* __KERNEL_GPU__ */
# ifndef __KERNEL_ONEAPI__
# define atomic_fetch_and_add_uint32_shared atomic_fetch_and_add_uint32
# endif
# if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)
# define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))
@@ -140,6 +144,11 @@ ccl_device_inline float atomic_compare_and_swap_float(volatile ccl_global float
# define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
# define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)
# define atomic_store_local(p, x) \
atomic_store_explicit((ccl_gpu_shared atomic_int *)p, x, memory_order_relaxed)
# define atomic_load_local(p) \
atomic_load_explicit((ccl_gpu_shared atomic_int *)p, memory_order_relaxed)
# define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
# define ccl_barrier(flags) threadgroup_barrier(flags)
@@ -191,6 +200,16 @@ ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
return atomic.fetch_add(x);
}
ccl_device_inline int atomic_fetch_and_add_uint32_shared(int *p, int x)
{
sycl::atomic_ref<int,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::local_space>
atomic(*p);
return atomic.fetch_add(x);
}
ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
unsigned int x)
{
@@ -253,6 +272,26 @@ ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
return atomic.fetch_or(x);
}
ccl_device_inline void atomic_store_local(int *p, int x)
{
sycl::atomic_ref<int,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::local_space>
atomic(*p);
atomic.store(x);
}
ccl_device_inline int atomic_load_local(int *p)
{
sycl::atomic_ref<int,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::local_space>
atomic(*p);
return atomic.load();
}
# endif /* __KERNEL_ONEAPI__ */
#endif /* __KERNEL_GPU__ */
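
A minimal usage sketch for the local-memory atomic helpers added above (standalone and illustrative, not Blender code; the function name is hypothetical): one thread writes a flag into group-local memory with a relaxed atomic store and every thread reads it back after a barrier. Note the merged helpers use sycl::memory_scope::device; since local memory is only visible within a work-group, work_group scope is sufficient and is what this sketch uses.

#include <sycl/sycl.hpp>

void local_atomic_example(sycl::queue &q) {
  q.submit([&](sycl::handler &cgh) {
    sycl::local_accessor<int> flag(1, cgh);
    cgh.parallel_for(sycl::nd_range<1>(128, 128), [=](sycl::nd_item<1> it) {
      /* Same atomic_ref shape as atomic_store_local()/atomic_load_local(). */
      sycl::atomic_ref<int,
                       sycl::memory_order::relaxed,
                       sycl::memory_scope::work_group,
                       sycl::access::address_space::local_space>
          f(flag[0]);
      if (it.get_local_id(0) == 0) {
        f.store(42);
      }
      sycl::group_barrier(it.get_group());
      const int v = f.load(); /* every thread in the group reads 42 */
      (void)v;
    });
  });
}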