From bf931d3d1c4744dbdd51f5996eddc5a261ecfd33 Mon Sep 17 00:00:00 2001
From: Xavier Hallade
Date: Mon, 3 Jun 2024 11:54:04 +0200
Subject: [PATCH 1/2] Cycles: extend struct-of-array-of-packed-structs to oneAPI device

---
 .../cycles/kernel/integrator/state_template.h | 15 +++++++++++++++
 intern/cycles/kernel/integrator/state_util.h  | 12 ++++++++++++
 intern/cycles/kernel/types.h                  | 13 ++++++++-----
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/intern/cycles/kernel/integrator/state_template.h b/intern/cycles/kernel/integrator/state_template.h
index 87317842f86..c2292fc1b18 100644
--- a/intern/cycles/kernel/integrator/state_template.h
+++ b/intern/cycles/kernel/integrator/state_template.h
@@ -67,6 +67,9 @@ KERNEL_STRUCT_END(path)
 
 /************************************** Ray ***********************************/
 
+/* Member order must match Ray so that it can be cast to packed_ray. With Metal on Apple Silicon,
+ * dP and dD live in the padding of P and D; elsewhere they follow the other packed members. */
+#if defined(TARGET_CPU_ARM64) || defined(__KERNEL_METAL_APPLE__)
 KERNEL_STRUCT_BEGIN_PACKED(ray, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER_PACKED(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
@@ -77,6 +80,18 @@ KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER_PACKED(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
 KERNEL_STRUCT_MEMBER(ray, float, previous_dt, KERNEL_FEATURE_LIGHT_TREE)
 KERNEL_STRUCT_END(ray)
+#else
+KERNEL_STRUCT_BEGIN_PACKED(ray, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, P, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, packed_float3, D, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmin, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, float, tmax, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, float, time, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, float, dP, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER_PACKED(ray, float, dD, KERNEL_FEATURE_PATH_TRACING)
+KERNEL_STRUCT_MEMBER(ray, float, previous_dt, KERNEL_FEATURE_LIGHT_TREE)
+KERNEL_STRUCT_END(ray)
+#endif
 
 /*************************** Intersection result ******************************/
 
diff --git a/intern/cycles/kernel/integrator/state_util.h b/intern/cycles/kernel/integrator/state_util.h
index 99ffd20f296..d2192039b3e 100644
--- a/intern/cycles/kernel/integrator/state_util.h
+++ b/intern/cycles/kernel/integrator/state_util.h
@@ -16,12 +16,15 @@ ccl_device_forceinline void integrator_state_write_ray(IntegratorState state,
                                                        ccl_private const Ray *ccl_restrict ray)
 {
 #if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
+#  ifdef __KERNEL_METAL_APPLE__
   static_assert(sizeof(ray->P) == sizeof(float4), "Bad assumption about float3 padding");
   /* dP and dP are packed based on the assumption that float3 is padded to 16 bytes.
    * This assumption hold trues on Metal, but not CUDA.
    */
   ((ccl_private float4 &)ray->P).w = ray->dP;
   ((ccl_private float4 &)ray->D).w = ray->dD;
+#  endif
+
   INTEGRATOR_STATE_WRITE(state, ray, packed) = (ccl_private packed_ray &)*ray;
 
   /* Ensure that we can correctly cast between Ray and the generated packed_ray struct. */
@@ -35,10 +38,17 @@ ccl_device_forceinline void integrator_state_write_ray(IntegratorState state,
                 "Generated packed_ray struct is misaligned with Ray struct");
   static_assert(offsetof(packed_ray, time) == offsetof(Ray, time),
                 "Generated packed_ray struct is misaligned with Ray struct");
+#  ifdef __KERNEL_METAL_APPLE__
   static_assert(offsetof(packed_ray, dP) == 12 + offsetof(Ray, P),
                 "Generated packed_ray struct is misaligned with Ray struct");
   static_assert(offsetof(packed_ray, dD) == 12 + offsetof(Ray, D),
                 "Generated packed_ray struct is misaligned with Ray struct");
+#  else
+  static_assert(offsetof(packed_ray, dP) == offsetof(Ray, dP),
+                "Generated packed_ray struct is misaligned with Ray struct");
+  static_assert(offsetof(packed_ray, dD) == offsetof(Ray, dD),
+                "Generated packed_ray struct is misaligned with Ray struct");
+#  endif
 #else
   INTEGRATOR_STATE_WRITE(state, ray, P) = ray->P;
   INTEGRATOR_STATE_WRITE(state, ray, D) = ray->D;
@@ -54,9 +64,12 @@ ccl_device_forceinline void integrator_state_read_ray(ConstIntegratorState state
                                                       ccl_private Ray *ccl_restrict ray)
 {
 #if defined(__INTEGRATOR_GPU_PACKED_STATE__) && defined(__KERNEL_GPU__)
   *((ccl_private packed_ray *)ray) = INTEGRATOR_STATE(state, ray, packed);
+#  ifdef __KERNEL_METAL_APPLE__
+  /* Unpack after the copy above: dP and dD are read back from the loaded P.w and D.w. */
   ray->dP = ((ccl_private float4 &)ray->P).w;
   ray->dD = ((ccl_private float4 &)ray->D).w;
+#  endif
 #else
   ray->P = INTEGRATOR_STATE(state, ray, P);
   ray->D = INTEGRATOR_STATE(state, ray, D);
diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 7952d384680..8148dc9220a 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -752,12 +752,15 @@ typedef struct Intersection {
   int type;
 } Intersection;
 
-/* On certain GPUs (Apple Silicon), splitting every integrator state field into its own separate
- * array can be detrimental for cache utilisation. By enabling __INTEGRATOR_GPU_PACKED_STATE__, we
- * specify that certain fields should be packed together. This improves cache hit ratios in cases
- * where fields are often accessed together (e.g. "ray" and "isect").
+/* On certain GPUs (Apple Silicon, Intel with recent SYCL), splitting every integrator state field
+ * into its own separate array can be detrimental for cache utilisation. By enabling
+ * __INTEGRATOR_GPU_PACKED_STATE__, we specify that certain fields should be packed together. This
+ * improves cache hit ratios in cases where fields are often accessed together (e.g. "ray" and
+ * "isect").
+ * This definition must be visible on host side.
  */
-#if defined(TARGET_CPU_ARM64) || defined(__KERNEL_METAL_APPLE__)
+#if !defined(__KERNEL_GPU__) || defined(__KERNEL_METAL_APPLE__) || \
+    (defined(__KERNEL_ONEAPI__) && __LIBSYCL_MAJOR_VERSION >= 7)
 #  define __INTEGRATOR_GPU_PACKED_STATE__
 
 /* Generate packed layouts for structs declared with KERNEL_STRUCT_BEGIN_PACKED. For example the
-- 
2.30.2

From fcf5f4d421324bc9900e3e9771a7b387df4d4965 Mon Sep 17 00:00:00 2001
From: Xavier Hallade
Date: Tue, 4 Jun 2024 21:33:29 +0200
Subject: [PATCH 2/2] Cycles: extend packed GPU integrator state to CUDA and HIP devices

---
 intern/cycles/kernel/types.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/intern/cycles/kernel/types.h b/intern/cycles/kernel/types.h
index 8148dc9220a..2775ccdad93 100644
--- a/intern/cycles/kernel/types.h
+++ b/intern/cycles/kernel/types.h
@@ -759,8 +759,7 @@ typedef struct Intersection {
  * "isect").
  * This definition must be visible on host side.
  */
-#if !defined(__KERNEL_GPU__) || defined(__KERNEL_METAL_APPLE__) || \
-    (defined(__KERNEL_ONEAPI__) && __LIBSYCL_MAJOR_VERSION >= 7)
+#if !defined(__APPLE__) || defined(TARGET_CPU_ARM64) || defined(__KERNEL_METAL_APPLE__)
 #  define __INTEGRATOR_GPU_PACKED_STATE__
 
 /* Generate packed layouts for structs declared with KERNEL_STRUCT_BEGIN_PACKED. For example the
-- 
2.30.2