Metal: Optimize SSR shader for Apple Silicon #106231

Merged
Jeroen Bakker merged 3 commits from Jason-Fielder/blender:EEVEE_SSR_OptiMetal into main 2023-04-03 08:37:35 +02:00
1 changed file with 53 additions and 38 deletions

View File

@ -103,6 +103,20 @@ void resolve_reflection_sample(int planar_index,
weight_accum += weight; weight_accum += weight;
} }
/* NOTE(Metal): For Apple silicon GPUs executing this particular shader, by default, memory read
 * pressure is high while ALU remains low. Packing the sample data into a smaller format balances
 * this trade-off by reducing local shader register pressure and expensive memory look-ups into
 * spilled local shader memory, resulting in an increase in performance of 20% for this shader.
 *
 * Metal layout: each sample is stored as a single `uchar` holding `((x + 2) << 3) + (y + 2)`.
 * The +2 bias maps the smallest supported component (-2) to zero, so `y + 2` sits in the low
 * 3 bits and `x + 2` in the bits above. `unpack_sample` reverses this and yields a `vec2`.
 * The round trip is only exact while `y` is in [-2, 5] and the packed value stays non-negative
 * when reinterpreted as `char` (assumes `char` is a signed 8-bit type -- TODO confirm).
 *
 * Other back-ends keep the plain `vec2` representation, where packing and unpacking are no-ops.
 *
 * Macro parameters are parenthesized so that expression arguments expand with the intended
 * precedence. */
#ifdef GPU_METAL
#  define SAMPLE_STORAGE_TYPE uchar
#  define pack_sample(x, y) uchar(((uchar((x) + 2)) << uchar(3)) + (uchar((y) + 2)))
#  define unpack_sample(x) vec2((char(x) >> 3) - 2, (char(x) & 7) - 2)
#else
#  define SAMPLE_STORAGE_TYPE vec2
#  define pack_sample(x, y) SAMPLE_STORAGE_TYPE(x, y)
#  define unpack_sample(x) (x)
#endif
void raytrace_resolve(ClosureInputGlossy cl_in, void raytrace_resolve(ClosureInputGlossy cl_in,
inout ClosureEvalGlossy cl_eval, inout ClosureEvalGlossy cl_eval,
inout ClosureEvalCommon cl_common, inout ClosureEvalCommon cl_common,
@ -110,55 +124,55 @@ void raytrace_resolve(ClosureInputGlossy cl_in,
{ {
/* Note: Reflection samples declared in function scope to avoid per-thread memory pressure on /* Note: Reflection samples declared in function scope to avoid per-thread memory pressure on
* tile-based GPUs e.g. Apple Silicon. */ * tile-based GPUs e.g. Apple Silicon. */
const vec2 resolve_sample_offsets[36] = vec2[36]( const SAMPLE_STORAGE_TYPE resolve_sample_offsets[36] = SAMPLE_STORAGE_TYPE[36](
/* Set 1. */ /* Set 1. */
/* First Ring (2x2). */ /* First Ring (2x2). */
vec2(0, 0), pack_sample(0, 0),
/* Second Ring (6x6). */ /* Second Ring (6x6). */
vec2(-1, 3), pack_sample(-1, 3),
vec2(1, 3), pack_sample(1, 3),
vec2(-1, 1), pack_sample(-1, 1),
vec2(3, 1), pack_sample(3, 1),
vec2(-2, 0), pack_sample(-2, 0),
vec2(3, 0), pack_sample(3, 0),
vec2(2, -1), pack_sample(2, -1),
vec2(1, -2), pack_sample(1, -2),
/* Set 2. */ /* Set 2. */
/* First Ring (2x2). */ /* First Ring (2x2). */
vec2(1, 1), pack_sample(1, 1),
/* Second Ring (6x6). */ /* Second Ring (6x6). */
vec2(-2, 3), pack_sample(-2, 3),
vec2(3, 3), pack_sample(3, 3),
vec2(0, 2), pack_sample(0, 2),
vec2(2, 2), pack_sample(2, 2),
vec2(-2, -1), pack_sample(-2, -1),
vec2(1, -1), pack_sample(1, -1),
vec2(0, -2), pack_sample(0, -2),
vec2(3, -2), pack_sample(3, -2),
/* Set 3. */ /* Set 3. */
/* First Ring (2x2). */ /* First Ring (2x2). */
vec2(0, 1), pack_sample(0, 1),
/* Second Ring (6x6). */ /* Second Ring (6x6). */
vec2(0, 3), pack_sample(0, 3),
vec2(3, 2), pack_sample(3, 2),
vec2(-2, 1), pack_sample(-2, 1),
vec2(2, 1), pack_sample(2, 1),
vec2(-1, 0), pack_sample(-1, 0),
vec2(-2, -2), pack_sample(-2, -2),
vec2(0, -1), pack_sample(0, -1),
vec2(2, -2), pack_sample(2, -2),
/* Set 4. */ /* Set 4. */
/* First Ring (2x2). */ /* First Ring (2x2). */
vec2(1, 0), pack_sample(1, 0),
/* Second Ring (6x6). */ /* Second Ring (6x6). */
vec2(2, 3), pack_sample(2, 3),
vec2(-2, 2), pack_sample(-2, 2),
vec2(-1, 2), pack_sample(-1, 2),
vec2(1, 2), pack_sample(1, 2),
vec2(2, 0), pack_sample(2, 0),
vec2(-1, -1), pack_sample(-1, -1),
vec2(3, -1), pack_sample(3, -1),
vec2(-1, -2)); pack_sample(-1, -2));
float roughness = cl_in.roughness; float roughness = cl_in.roughness;
@ -208,7 +222,8 @@ void raytrace_resolve(ClosureInputGlossy cl_in,
int sample_id = sample_pool * resolve_samples_count + i; int sample_id = sample_pool * resolve_samples_count + i;
vec2 texture_size = vec2(textureSize(hitBuffer, 0)); vec2 texture_size = vec2(textureSize(hitBuffer, 0));
vec2 sample_texel = texture_size * uvcoordsvar.xy * ssrUvScale; vec2 sample_texel = texture_size * uvcoordsvar.xy * ssrUvScale;
vec2 sample_uv = (sample_texel + resolve_sample_offsets[sample_id]) / texture_size; vec2 sample_uv = (sample_texel + unpack_sample(resolve_sample_offsets[sample_id])) /
texture_size;
resolve_reflection_sample( resolve_reflection_sample(
planar_index, sample_uv, vP, vN, vV, roughness_squared, cone_tan, weight_acc, ssr_accum); planar_index, sample_uv, vP, vN, vV, roughness_squared, cone_tan, weight_acc, ssr_accum);