diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_depth_of_field_accumulator_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_depth_of_field_accumulator_lib.glsl index 1da741d7609..957c9b01a2a 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_depth_of_field_accumulator_lib.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_depth_of_field_accumulator_lib.glsl @@ -590,7 +590,7 @@ void dof_gather_accumulator(sampler2D color_tx, * The full pixel neighborhood is gathered. * \{ */ -void dof_slight_focus_gather(sampler2D depth_tx, +void dof_slight_focus_gather(depth2D depth_tx, sampler2D color_tx, sampler2D bkh_lut_tx, /* Renamed because of ugly macro job. */ float radius, diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_hiz_update_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_hiz_update_comp.glsl index 479a6b590b0..cea25ef7ce0 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_hiz_update_comp.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_hiz_update_comp.glsl @@ -62,7 +62,7 @@ void main() int mask_shift = 1; #define downsample_level(out_mip__, lod_) \ - active_thread = all(lessThan(local_px, gl_WorkGroupSize.xy >> uint(mask_shift))); \ + active_thread = all(lessThan(uvec2(local_px), gl_WorkGroupSize.xy >> uint(mask_shift))); \ barrier(); /* Wait for previous writes to finish. */ \ if (active_thread) { \ max_depth = max_v4(load_local_depths(local_px)); \ @@ -89,12 +89,12 @@ void main() } finished_tile_counter = 0u; - ivec2 iter = divide_ceil(imageSize(out_mip_5), ivec2(gl_WorkGroupSize * 2u)); + ivec2 iter = divide_ceil(imageSize(out_mip_5), ivec2(gl_WorkGroupSize.xy * 2u)); ivec2 image_border = imageSize(out_mip_5) - 1; for (int y = 0; y < iter.y; y++) { for (int x = 0; x < iter.x; x++) { /* Load result of the other work groups. */ - kernel_origin = ivec2(gl_WorkGroupSize) * ivec2(x, y); + kernel_origin = ivec2(gl_WorkGroupSize.xy) * ivec2(x, y); src_px = ivec2(kernel_origin + local_px) * 2; vec4 samp; samp.x = imageLoad(out_mip_5, min(src_px + ivec2(0, 1), image_border)).x; diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl index 37705e22b22..6479f4f98ff 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_light_culling_tile_comp.glsl @@ -168,13 +168,14 @@ void main() } /* Fallthrough to the hemispheric case. 
*/ case LIGHT_RECT: - case LIGHT_ELLIPSE: + case LIGHT_ELLIPSE: { vec3 v000 = vP - v_right * radius - v_up * radius; vec3 v100 = v000 + v_right * (radius * 2.0); vec3 v010 = v000 + v_up * (radius * 2.0); vec3 v001 = v000 - v_back * radius; Box bbox = shape_box(v000, v100, v010, v001); intersect_tile = intersect_tile && intersect(tile, bbox); + } break; default: break; } diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_dilate_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_dilate_comp.glsl index 07139ea6a09..e365da53d2b 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_dilate_comp.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_dilate_comp.glsl @@ -74,8 +74,10 @@ void main() vec4 max_motion = imageLoad(in_tiles_img, src_tile); - MotionPayload payload_prv = motion_blur_tile_indirection_pack_payload(max_motion.xy, src_tile); - MotionPayload payload_nxt = motion_blur_tile_indirection_pack_payload(max_motion.zw, src_tile); + MotionPayload payload_prv = motion_blur_tile_indirection_pack_payload(max_motion.xy, + uvec2(src_tile)); + MotionPayload payload_nxt = motion_blur_tile_indirection_pack_payload(max_motion.zw, + uvec2(src_tile)); if (true) { /* Rectangular area (in tiles) where the motion vector spreads. */ MotionRect motion_rect = compute_motion_rect(src_tile, max_motion.xy); @@ -85,17 +87,20 @@ void main() for (int y = 0; y < motion_rect.extent.y; y++) { ivec2 tile = motion_rect.bottom_left + ivec2(x, y); if (is_inside_motion_line(tile, motion_line)) { - motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_PREV, tile, payload_prv); + motion_blur_tile_indirection_store( + tile_indirection_buf, MOTION_PREV, uvec2(tile), payload_prv); /* FIXME: This is a bit weird, but for some reason, we need the store the same vector in * the motion next so that weighting in gather pass is better. */ - motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_NEXT, tile, payload_nxt); + motion_blur_tile_indirection_store( + tile_indirection_buf, MOTION_NEXT, uvec2(tile), payload_nxt); } } } } if (true) { - MotionPayload payload = motion_blur_tile_indirection_pack_payload(max_motion.zw, src_tile); + MotionPayload payload = motion_blur_tile_indirection_pack_payload(max_motion.zw, + uvec2(src_tile)); /* Rectangular area (in tiles) where the motion vector spreads. */ MotionRect motion_rect = compute_motion_rect(src_tile, max_motion.zw); MotionLine motion_line = compute_motion_line(src_tile, max_motion.zw); @@ -104,10 +109,12 @@ void main() for (int y = 0; y < motion_rect.extent.y; y++) { ivec2 tile = motion_rect.bottom_left + ivec2(x, y); if (is_inside_motion_line(tile, motion_line)) { - motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_NEXT, tile, payload_nxt); + motion_blur_tile_indirection_store( + tile_indirection_buf, MOTION_NEXT, uvec2(tile), payload_nxt); /* FIXME: This is a bit weird, but for some reason, we need the store the same vector in * the motion next so that weighting in gather pass is better. 
*/ - motion_blur_tile_indirection_store(tile_indirection_buf, MOTION_PREV, tile, payload_prv); + motion_blur_tile_indirection_store( + tile_indirection_buf, MOTION_PREV, uvec2(tile), payload_prv); } } } diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_gather_comp.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_gather_comp.glsl index 5249e6637b6..1408f28e585 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_gather_comp.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_motion_blur_gather_comp.glsl @@ -178,10 +178,10 @@ void main() vec4 max_motion; /* Load dilation result from the indirection table. */ ivec2 tile_prev; - motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_PREV, tile, tile_prev); + motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_PREV, uvec2(tile), tile_prev); max_motion.xy = imageLoad(in_tiles_img, tile_prev).xy; ivec2 tile_next; - motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_NEXT, tile, tile_next); + motion_blur_tile_indirection_load(tile_indirection_buf, MOTION_NEXT, uvec2(tile), tile_next); max_motion.zw = imageLoad(in_tiles_img, tile_next).zw; Accumulator accum; diff --git a/source/blender/draw/engines/eevee_next/shaders/eevee_nodetree_lib.glsl b/source/blender/draw/engines/eevee_next/shaders/eevee_nodetree_lib.glsl index 965780d9bcf..c19ca7d17ff 100644 --- a/source/blender/draw/engines/eevee_next/shaders/eevee_nodetree_lib.glsl +++ b/source/blender/draw/engines/eevee_next/shaders/eevee_nodetree_lib.glsl @@ -240,13 +240,13 @@ void output_aov(vec4 color, float value, uint hash) #if defined(MAT_AOV_SUPPORT) && defined(GPU_FRAGMENT_SHADER) for (int i = 0; i < AOV_MAX && i < aov_buf.color_len; i++) { if (aov_buf.hash_color[i] == hash) { - imageStore(aov_color_img, ivec3(gl_FragCoord.xy, i), color); + imageStore(aov_color_img, ivec3(ivec2(gl_FragCoord.xy), i), color); return; } } for (int i = 0; i < AOV_MAX && i < aov_buf.value_len; i++) { if (aov_buf.hash_value[i] == hash) { - imageStore(aov_value_img, ivec3(gl_FragCoord.xy, i), vec4(value)); + imageStore(aov_value_img, ivec3(ivec2(gl_FragCoord.xy), i), vec4(value)); return; } } diff --git a/source/blender/draw/intern/draw_curves.cc b/source/blender/draw/intern/draw_curves.cc index 48e9cca98e0..2789de4e7b2 100644 --- a/source/blender/draw/intern/draw_curves.cc +++ b/source/blender/draw/intern/draw_curves.cc @@ -33,7 +33,12 @@ BLI_INLINE eParticleRefineShaderType drw_curves_shader_type_get() { - if (GPU_compute_shader_support() && GPU_shader_storage_buffer_objects_support()) { + /* NOTE: Curve refine is faster using transform feedback via vertex processing pipeline with + * Metal and Apple Silicon GPUs. This is also because vertex work can more easily be executed in + * parallel with fragment work, whereas compute inserts an explicit dependency, + * due to switching of command encoder types. 
*/ + if (GPU_compute_shader_support() && GPU_shader_storage_buffer_objects_support() && + (GPU_backend_get_type() != GPU_BACKEND_METAL)) { return PART_REFINE_SHADER_COMPUTE; } if (GPU_transform_feedback_support()) { diff --git a/source/blender/draw/intern/draw_hair.cc b/source/blender/draw/intern/draw_hair.cc index e8220346d25..de06d557929 100644 --- a/source/blender/draw/intern/draw_hair.cc +++ b/source/blender/draw/intern/draw_hair.cc @@ -36,7 +36,12 @@ BLI_INLINE eParticleRefineShaderType drw_hair_shader_type_get() { - if (GPU_compute_shader_support() && GPU_shader_storage_buffer_objects_support()) { + /* NOTE: Hair refine is faster using transform feedback via vertex processing pipeline with Metal + * and Apple Silicon GPUs. This is also because vertex work can more easily be executed in + * parallel with fragment work, whereas compute inserts an explicit dependency, + * due to switching of command encoder types. */ + if (GPU_compute_shader_support() && GPU_shader_storage_buffer_objects_support() && + (GPU_backend_get_type() != GPU_BACKEND_METAL)) { return PART_REFINE_SHADER_COMPUTE; } if (GPU_transform_feedback_support()) { diff --git a/source/blender/draw/intern/draw_shader_shared.h b/source/blender/draw/intern/draw_shader_shared.h index 28090ef2b46..b8e992b448f 100644 --- a/source/blender/draw/intern/draw_shader_shared.h +++ b/source/blender/draw/intern/draw_shader_shared.h @@ -329,6 +329,14 @@ struct DRWDebugVert { uint pos2; /* Named vert_color to avoid global namespace collision with uniform color. */ uint vert_color; + +#ifdef GPU_METAL + inline DRWDebugVert() = default; + inline DRWDebugVert(uint in_pos0, uint in_pos1, uint in_pos2, uint in_vert_color) + : pos0(in_pos0), pos1(in_pos1), pos2(in_pos2), vert_color(in_vert_color) + { + } +#endif }; BLI_STATIC_ASSERT_ALIGN(DRWDebugVert, 16) diff --git a/source/blender/draw/intern/shaders/common_shape_lib.glsl b/source/blender/draw/intern/shaders/common_shape_lib.glsl index 56722c417aa..016ef944859 100644 --- a/source/blender/draw/intern/shaders/common_shape_lib.glsl +++ b/source/blender/draw/intern/shaders/common_shape_lib.glsl @@ -14,6 +14,13 @@ struct Circle { vec2 center; float radius; + +#ifdef GPU_METAL + inline Circle() = default; + inline Circle(vec2 in_center, float in_radius) : center(in_center), radius(in_radius) + { + } +#endif }; Circle shape_circle(vec2 center, float radius) @@ -30,6 +37,13 @@ Circle shape_circle(vec2 center, float radius) struct Sphere { vec3 center; float radius; + +#ifdef GPU_METAL + inline Sphere() = default; + inline Sphere(vec3 in_center, float in_radius) : center(in_center), radius(in_radius) + { + } +#endif }; Sphere shape_sphere(vec3 center, float radius) @@ -192,6 +206,14 @@ Frustum shape_frustum(vec3 corners[8]) struct Cone { vec3 direction; float angle_cos; + +#ifdef GPU_METAL + inline Cone() = default; + inline Cone(vec3 in_direction, float in_angle_cos) + : direction(in_direction), angle_cos(in_angle_cos) + { + } +#endif }; Cone shape_cone(vec3 direction, float angle_cosine) diff --git a/source/blender/draw/intern/shaders/draw_view_finalize_comp.glsl b/source/blender/draw/intern/shaders/draw_view_finalize_comp.glsl index f3af010a47c..c7917357b2c 100644 --- a/source/blender/draw/intern/shaders/draw_view_finalize_comp.glsl +++ b/source/blender/draw/intern/shaders/draw_view_finalize_comp.glsl @@ -33,7 +33,7 @@ void projmat_dimensions(mat4 winmat, } } -void frustum_boundbox_calc(mat4 winmat, mat4 viewinv, out vec4 corners[8]) +void frustum_boundbox_calc(mat4 winmat, mat4 viewinv, 
DEVICE_OUT_ARRAY(vec4, corners, 8)) { float left, right, bottom, top, near, far; bool is_persp = winmat[3][3] == 0.0; @@ -68,12 +68,12 @@ void frustum_boundbox_calc(mat4 winmat, mat4 viewinv, out vec4 corners[8]) } void planes_from_projmat(mat4 mat, - out vec4 left, - out vec4 right, - out vec4 bottom, - out vec4 top, - out vec4 near, - out vec4 far) + DEVICE_OUT(vec4, left), + DEVICE_OUT(vec4, right), + DEVICE_OUT(vec4, bottom), + DEVICE_OUT(vec4, top), + DEVICE_OUT(vec4, near), + DEVICE_OUT(vec4, far)) { /* References: * @@ -89,7 +89,7 @@ void planes_from_projmat(mat4 mat, far = mat[3] - mat[2]; } -void frustum_culling_planes_calc(mat4 winmat, mat4 viewmat, out vec4 planes[6]) +void frustum_culling_planes_calc(mat4 winmat, mat4 viewmat, DEVICE_OUT_ARRAY(vec4, planes, 6)) { mat4 persmat = winmat * viewmat; planes_from_projmat(persmat, planes[0], planes[5], planes[1], planes[3], planes[4], planes[2]); @@ -100,7 +100,7 @@ void frustum_culling_planes_calc(mat4 winmat, mat4 viewmat, out vec4 planes[6]) } } -vec4 frustum_culling_sphere_calc(vec4 corners[8]) +vec4 frustum_culling_sphere_calc(device vec4 corners[8]) { /* Extract Bounding Sphere */ /* TODO(fclem): This is significantly less precise than CPU, but it isn't used in most cases. */ diff --git a/source/blender/gpu/CMakeLists.txt b/source/blender/gpu/CMakeLists.txt index 4b378384a9d..a80ff64b177 100644 --- a/source/blender/gpu/CMakeLists.txt +++ b/source/blender/gpu/CMakeLists.txt @@ -240,6 +240,7 @@ set(METAL_SRC metal/mtl_shader_generator.mm metal/mtl_shader_interface.mm metal/mtl_state.mm + metal/mtl_storage_buffer.mm metal/mtl_texture.mm metal/mtl_texture_util.mm metal/mtl_uniform_buffer.mm @@ -265,6 +266,7 @@ set(METAL_SRC metal/mtl_shader_interface_type.hh metal/mtl_shader_shared.h metal/mtl_state.hh + metal/mtl_storage_buffer.hh metal/mtl_texture.hh metal/mtl_uniform_buffer.hh metal/mtl_vertex_buffer.hh diff --git a/source/blender/gpu/metal/mtl_backend.mm b/source/blender/gpu/metal/mtl_backend.mm index c4a3f38b83b..280520177f3 100644 --- a/source/blender/gpu/metal/mtl_backend.mm +++ b/source/blender/gpu/metal/mtl_backend.mm @@ -16,6 +16,7 @@ #include "mtl_index_buffer.hh" #include "mtl_query.hh" #include "mtl_shader.hh" +#include "mtl_storage_buffer.hh" #include "mtl_uniform_buffer.hh" #include "mtl_vertex_buffer.hh" @@ -100,8 +101,7 @@ UniformBuf *MTLBackend::uniformbuf_alloc(int size, const char *name) StorageBuf *MTLBackend::storagebuf_alloc(int size, GPUUsageType usage, const char *name) { - /* TODO(Metal): Implement MTLStorageBuf. */ - return nullptr; + return new MTLStorageBuf(size, usage, name); } VertBuf *MTLBackend::vertbuf_alloc() @@ -398,16 +398,16 @@ void MTLBackend::capabilities_init(MTLContext *ctx) GCaps.shader_image_load_store_support = ([device supportsFamily:MTLGPUFamilyApple3] || MTLBackend::capabilities.supports_family_mac1 || MTLBackend::capabilities.supports_family_mac2); + GCaps.compute_shader_support = true; + GCaps.shader_storage_buffer_objects_support = false; /* TODO(Metal): Add support? */ GCaps.shader_draw_parameters_support = false; - GCaps.compute_shader_support = true; + GCaps.geometry_shader_support = false; - GCaps.shader_storage_buffer_objects_support = - false; /* TODO(Metal): implement Storage Buffer support. */ /* Maximum buffer bindings: 31. Consider required slot for uniforms/UBOs/Vertex attributes. * Can use argument buffers if a higher limit is required. 
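  * Editorial note (not from the patch): 31 is Metal's hard per-stage limit on
  * [[buffer(N)]] bindings; argument buffers sidestep it by packing many resource
  * references into a single bound buffer, at the cost of an indirection.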
 */
-  GCaps.max_shader_storage_buffer_bindings = 24;
+  GCaps.max_shader_storage_buffer_bindings = 14;
 
   if (GCaps.compute_shader_support) {
     GCaps.max_work_group_count[0] = 65535;
diff --git a/source/blender/gpu/metal/mtl_capabilities.hh b/source/blender/gpu/metal/mtl_capabilities.hh
index f236bae4e92..a564549da45 100644
--- a/source/blender/gpu/metal/mtl_capabilities.hh
+++ b/source/blender/gpu/metal/mtl_capabilities.hh
@@ -18,7 +18,12 @@ namespace gpu {
 #define MTL_MAX_DEFAULT_SAMPLERS 16
 /* Total maximum buffers which can be bound to an encoder, for use within a shader.
  * MTL_MAX_UNIFORM_BUFFER_BINDINGS + MTL_MAX_STORAGE_BUFFER_BINDINGS must be <=
- * than MTL_MAX_BUFFER_BINDINGS. */
+ * than MTL_MAX_BUFFER_BINDINGS.
+ * We also require an additional 3 core buffers for:
+ * - Argument buffer for bindless resources (e.g. samplers)
+ * - Transform feedback buffer
+ * - Default push constant block
+ * Along with up to 6+1 buffers for vertex data, and index data. */
 #define MTL_MAX_BUFFER_BINDINGS 31
 #define MTL_MAX_UNIFORM_BUFFER_BINDINGS 16
 #define MTL_MAX_STORAGE_BUFFER_BINDINGS 12
diff --git a/source/blender/gpu/metal/mtl_context.hh b/source/blender/gpu/metal/mtl_context.hh
index 5b37fefa2d8..05f39652466 100644
--- a/source/blender/gpu/metal/mtl_context.hh
+++ b/source/blender/gpu/metal/mtl_context.hh
@@ -46,6 +46,7 @@ namespace blender::gpu {
 class MTLContext;
 class MTLCommandBufferManager;
 class MTLUniformBuf;
+class MTLStorageBuf;
 
 /* Structs containing information on current binding state for textures and samplers. */
 struct MTLTextureBinding {
@@ -436,6 +437,11 @@ struct MTLUniformBufferBinding {
   MTLUniformBuf *ubo;
 };
 
+struct MTLStorageBufferBinding {
+  bool bound;
+  MTLStorageBuf *ssbo;
+};
+
 struct MTLContextGlobalShaderPipelineState {
   bool initialised;
 
@@ -457,6 +463,9 @@ struct MTLContextGlobalShaderPipelineState {
   /* Global Uniform Buffers. */
   MTLUniformBufferBinding ubo_bindings[MTL_MAX_UNIFORM_BUFFER_BINDINGS];
 
+  /* Storage buffer. */
+  MTLStorageBufferBinding ssbo_bindings[MTL_MAX_STORAGE_BUFFER_BINDINGS];
+
   /* Context Texture bindings. */
   MTLTextureBinding texture_bindings[MTL_MAX_TEXTURE_SLOTS];
   MTLSamplerBinding sampler_bindings[MTL_MAX_SAMPLER_SLOTS];
diff --git a/source/blender/gpu/metal/mtl_context.mm b/source/blender/gpu/metal/mtl_context.mm
index 14e53844722..f36a8accae7 100644
--- a/source/blender/gpu/metal/mtl_context.mm
+++ b/source/blender/gpu/metal/mtl_context.mm
@@ -12,6 +12,7 @@
 #include "mtl_shader.hh"
 #include "mtl_shader_interface.hh"
 #include "mtl_state.hh"
+#include "mtl_storage_buffer.hh"
 #include "mtl_uniform_buffer.hh"
 #include "mtl_vertex_buffer.hh"
 
@@ -20,6 +21,7 @@
 #include "GPU_capabilities.h"
 #include "GPU_matrix.h"
 #include "GPU_shader.h"
+#include "GPU_storage_buffer.h"
 #include "GPU_texture.h"
 #include "GPU_uniform_buffer.h"
 #include "GPU_vertex_buffer.h"
@@ -272,6 +274,16 @@ MTLContext::~MTLContext()
     }
   }
 
+  /* Unbind SSBOs. */
+  for (int i = 0; i < MTL_MAX_STORAGE_BUFFER_BINDINGS; i++) {
+    if (this->pipeline_state.ssbo_bindings[i].bound &&
+        this->pipeline_state.ssbo_bindings[i].ssbo != nullptr) {
+      GPUStorageBuf *ssbo = wrap(
+          static_cast<StorageBuf *>(this->pipeline_state.ssbo_bindings[i].ssbo));
+      GPU_storagebuf_unbind(ssbo);
+    }
+  }
+
   /* Release Dummy resources */
   this->free_dummy_resources();
 
@@ -360,6 +372,15 @@ void MTLContext::activate()
     }
   }
 
+  /* Reset SSBO bind state.
 */
+  for (int i = 0; i < MTL_MAX_STORAGE_BUFFER_BINDINGS; i++) {
+    if (this->pipeline_state.ssbo_bindings[i].bound &&
+        this->pipeline_state.ssbo_bindings[i].ssbo != nullptr) {
+      this->pipeline_state.ssbo_bindings[i].bound = false;
+      this->pipeline_state.ssbo_bindings[i].ssbo = nullptr;
+    }
+  }
+
   /* Ensure imm active. */
   immActivate();
 }
@@ -658,6 +679,10 @@ void MTLContext::pipeline_state_init()
     this->pipeline_state.ubo_bindings[u].bound = false;
     this->pipeline_state.ubo_bindings[u].ubo = nullptr;
   }
+  for (int u = 0; u < MTL_MAX_STORAGE_BUFFER_BINDINGS; u++) {
+    this->pipeline_state.ssbo_bindings[u].bound = false;
+    this->pipeline_state.ssbo_bindings[u].ssbo = nullptr;
+  }
 }
 
 /*** State defaults -- restored by GPU_state_init. ***/
@@ -1026,7 +1051,7 @@ bool MTLContext::ensure_uniform_buffer_bindings(
       rps.last_bound_shader_state.pso_index_ != pipeline_state_instance->shader_pso_index);
 
-  const MTLShaderUniformBlock &push_constant_block = shader_interface->get_push_constant_block();
+  const MTLShaderBufferBlock &push_constant_block = shader_interface->get_push_constant_block();
 
   if (push_constant_block.size > 0) {
 
     /* Fetch uniform buffer base binding index from pipeline_state_instance - There buffer index
@@ -1061,7 +1086,7 @@ bool MTLContext::ensure_uniform_buffer_bindings(
    * match. This is used to support the gpu_uniformbuffer module, where the uniform data is global,
    * and not owned by the shader instance. */
   for (const uint ubo_index : IndexRange(shader_interface->get_total_uniform_blocks())) {
-    const MTLShaderUniformBlock &ubo = shader_interface->get_uniform_block(ubo_index);
+    const MTLShaderBufferBlock &ubo = shader_interface->get_uniform_block(ubo_index);
 
     if (ubo.buffer_index >= 0) {
 
@@ -1177,6 +1202,58 @@ bool MTLContext::ensure_uniform_buffer_bindings(
       }
     }
   }
+
+  /* Bind Global GPUStorageBuf's */
+  /* Iterate through expected SSBOs in the shader interface, and check if the globally bound ones
+   * match. This is used to support the gpu_uniformbuffer module, where the uniform data is global,
+   * and not owned by the shader instance. */
+  for (const uint ssbo_index : IndexRange(shader_interface->get_total_storage_blocks())) {
+    const MTLShaderBufferBlock &ssbo = shader_interface->get_storage_block(ssbo_index);
+
+    if (ssbo.buffer_index >= 0) {
+      id<MTLBuffer> ssbo_buffer = nil;
+      int ssbo_size = 0;
+      UNUSED_VARS_NDEBUG(ssbo_size);
+
+      if (this->pipeline_state.ssbo_bindings[ssbo_index].bound) {
+
+        /* Fetch SSBO global-binding properties from slot. */
+        ssbo_buffer = this->pipeline_state.ssbo_bindings[ssbo_index].ssbo->get_metal_buffer();
+        ssbo_size = this->pipeline_state.ssbo_bindings[ssbo_index].ssbo->get_size();
+
+        /* For SSBOs, we always need to ensure the buffer exists, as it may be written to. */
+        BLI_assert(ssbo_buffer != nil);
+        BLI_assert(ssbo_size > 0);
+      }
+      else {
+        MTL_LOG_INFO(
+            "[Warning][SSBO] Shader '%s' expected SSBO '%s' to be bound at buffer index: %d -- "
+            "but "
+            "nothing was bound.\n",
+            shader_interface->get_name(),
+            shader_interface->get_name_at_offset(ssbo.name_offset),
+            ssbo.buffer_index);
+      }
+
+      if (ssbo_buffer != nil) {
+        uint32_t buffer_bind_index = pipeline_state_instance->base_storage_buffer_index +
+                                     ssbo.buffer_index;
+
+        /* Bind Vertex SSBO. */
+        if (bool(ssbo.stage_mask & ShaderStage::VERTEX)) {
+          BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS);
+          rps.bind_vertex_buffer(ssbo_buffer, 0, buffer_bind_index);
+        }
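+        /* [Editorial sketch, not part of the patch.] Worked example of the slot
+         * arithmetic above, with hypothetical values: if this PSO reserved
+         * base_storage_buffer_index = 8 (vertex buffers in slots 0..5, push
+         * constants at 6, a single UBO at 7) and the create-info declared this
+         * SSBO at binding 2, then:
+         *   buffer_bind_index = 8 + 2;  // Binds as [[buffer(10)]] in MSL.
+         */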
+        /* Bind Fragment SSBO. */
+        if (bool(ssbo.stage_mask & ShaderStage::FRAGMENT)) {
+          BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS);
+          rps.bind_fragment_buffer(ssbo_buffer, 0, buffer_bind_index);
+        }
+      }
+    }
+  }
+
   return true;
 }
@@ -1191,7 +1268,7 @@ bool MTLContext::ensure_uniform_buffer_bindings(
   MTLComputeState &cs = this->main_command_buffer.get_compute_state();
 
   /* Fetch push constant block and bind. */
-  const MTLShaderUniformBlock &push_constant_block = shader_interface->get_push_constant_block();
+  const MTLShaderBufferBlock &push_constant_block = shader_interface->get_push_constant_block();
 
   if (push_constant_block.size > 0) {
 
     /* Fetch uniform buffer base binding index from pipeline_state_instance - There buffer index
@@ -1218,7 +1295,7 @@ bool MTLContext::ensure_uniform_buffer_bindings(
    * match. This is used to support the gpu_uniformbuffer module, where the uniform data is global,
    * and not owned by the shader instance. */
   for (const uint ubo_index : IndexRange(shader_interface->get_total_uniform_blocks())) {
-    const MTLShaderUniformBlock &ubo = shader_interface->get_uniform_block(ubo_index);
+    const MTLShaderBufferBlock &ubo = shader_interface->get_uniform_block(ubo_index);
 
     if (ubo.buffer_index >= 0) {
 
@@ -1270,7 +1347,7 @@ bool MTLContext::ensure_uniform_buffer_bindings(
         uint32_t buffer_bind_index = pipeline_state_instance.base_uniform_buffer_index +
                                      buffer_index;
 
-        /* Bind Vertex UBO. */
+        /* Bind Compute UBO. */
         if (bool(ubo.stage_mask & ShaderStage::COMPUTE)) {
           BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS);
           cs.bind_compute_buffer(ubo_buffer, ubo_offset, buffer_bind_index);
@@ -1286,6 +1363,52 @@ bool MTLContext::ensure_uniform_buffer_bindings(
       }
     }
   }
+
+  /* Bind Global GPUStorageBuffers */
+  /* Iterate through expected SSBOs in the shader interface, and check if the globally bound ones
+   * match. */
+  for (const uint ssbo_index : IndexRange(shader_interface->get_total_storage_blocks())) {
+    const MTLShaderBufferBlock &ssbo = shader_interface->get_storage_block(ssbo_index);
+
+    if (ssbo.buffer_index >= 0) {
+      id<MTLBuffer> ssbo_buffer = nil;
+      int ssbo_size = 0;
+
+      if (this->pipeline_state.ssbo_bindings[ssbo_index].bound) {
+
+        /* Fetch SSBO global-binding properties from slot. */
+        ssbo_buffer = this->pipeline_state.ssbo_bindings[ssbo_index].ssbo->get_metal_buffer();
+        ssbo_size = this->pipeline_state.ssbo_bindings[ssbo_index].ssbo->get_size();
+        UNUSED_VARS_NDEBUG(ssbo_size);
+
+        /* For SSBOs, we always need to ensure the buffer exists, as it may be written to. */
+        BLI_assert(ssbo_buffer != nil);
+        BLI_assert(ssbo_size > 0);
+      }
+      else {
+        MTL_LOG_ERROR(
+            "[Error][SSBO] Shader '%s' expected SSBO '%s' to be bound at SSBO index: %d (buffer "
+            "%d) -- but "
+            "nothing was bound.\n",
+            shader_interface->get_name(),
+            shader_interface->get_name_at_offset(ssbo.name_offset),
+            ssbo.buffer_index,
+            pipeline_state_instance.base_storage_buffer_index + ssbo.buffer_index);
+      }
+
+      if (ssbo_buffer != nil) {
+        uint32_t buffer_bind_index = pipeline_state_instance.base_storage_buffer_index +
+                                     ssbo.buffer_index;
+
+        /* Bind Compute SSBO.
*/ + if (bool(ssbo.stage_mask & ShaderStage::COMPUTE)) { + BLI_assert(buffer_bind_index >= 0 && buffer_bind_index < MTL_MAX_BUFFER_BINDINGS); + cs.bind_compute_buffer(ssbo_buffer, 0, buffer_bind_index); + } + } + } + } + return true; } diff --git a/source/blender/gpu/metal/mtl_index_buffer.hh b/source/blender/gpu/metal/mtl_index_buffer.hh index dd828598110..182c1d2bf10 100644 --- a/source/blender/gpu/metal/mtl_index_buffer.hh +++ b/source/blender/gpu/metal/mtl_index_buffer.hh @@ -18,12 +18,16 @@ namespace blender::gpu { class MTLIndexBuf : public IndexBuf { friend class MTLBatch; friend class MTLDrawList; + friend class MTLStorageBuf; /* For bind as SSBO resource access. */ private: /* Metal buffer resource. */ gpu::MTLBuffer *ibo_ = nullptr; uint64_t alloc_size_ = 0; + /* SSBO wrapper for bind_as_ssbo support. */ + MTLStorageBuf *ssbo_wrapper_ = nullptr; + #ifndef NDEBUG /* Flags whether point index buffer has been compacted * to remove false restart indices. */ diff --git a/source/blender/gpu/metal/mtl_index_buffer.mm b/source/blender/gpu/metal/mtl_index_buffer.mm index 6a912983492..b59837f4c98 100644 --- a/source/blender/gpu/metal/mtl_index_buffer.mm +++ b/source/blender/gpu/metal/mtl_index_buffer.mm @@ -7,6 +7,7 @@ #include "mtl_index_buffer.hh" #include "mtl_context.hh" #include "mtl_debug.hh" +#include "mtl_storage_buffer.hh" #include "BLI_span.hh" @@ -22,6 +23,11 @@ MTLIndexBuf::~MTLIndexBuf() ibo_->free(); } this->free_optimized_buffer(); + + if (ssbo_wrapper_) { + delete ssbo_wrapper_; + ssbo_wrapper_ = nullptr; + } } void MTLIndexBuf::free_optimized_buffer() @@ -42,8 +48,14 @@ void MTLIndexBuf::bind_as_ssbo(uint32_t binding) /* Ensure we have a valid IBO. */ BLI_assert(this->ibo_); - /* TODO(Metal): Support index buffer SSBO's. Dependent on compute implementation. */ - MTL_LOG_WARNING("MTLIndexBuf::bind_as_ssbo not yet implemented!\n"); + /* Ensure resource is initialized. */ + this->upload_data(); + + /* Create MTLStorageBuffer to wrap this resource and use conventional binding. */ + if (ssbo_wrapper_ == nullptr) { + ssbo_wrapper_ = new MTLStorageBuf(this, alloc_size_); + } + ssbo_wrapper_->bind(binding); } void MTLIndexBuf::read(uint32_t *data) const diff --git a/source/blender/gpu/metal/mtl_shader.hh b/source/blender/gpu/metal/mtl_shader.hh index 095ca94cc41..8c105b1a122 100644 --- a/source/blender/gpu/metal/mtl_shader.hh +++ b/source/blender/gpu/metal/mtl_shader.hh @@ -71,7 +71,7 @@ struct MTLRenderPipelineStateInstance { * bound buffers such as vertex buffers, as the count can vary. */ int base_uniform_buffer_index; /* Base bind index for binding storage buffers. */ - int base_ssbo_buffer_index; + int base_storage_buffer_index; /* buffer bind slot used for null attributes (-1 if not needed). */ int null_attribute_buffer_index; /* buffer bind used for transform feedback output buffer. */ @@ -101,7 +101,7 @@ struct MTLComputePipelineStateInstance { * bound buffers such as vertex buffers, as the count can vary. */ int base_uniform_buffer_index = -1; /* Base bind index for binding storage buffers. 
*/ - int base_ssbo_buffer_index = -1; + int base_storage_buffer_index = -1; int threadgroup_x_len = 1; int threadgroup_y_len = 1; diff --git a/source/blender/gpu/metal/mtl_shader.mm b/source/blender/gpu/metal/mtl_shader.mm index 3f0bcaea875..8c6379133f6 100644 --- a/source/blender/gpu/metal/mtl_shader.mm +++ b/source/blender/gpu/metal/mtl_shader.mm @@ -386,7 +386,7 @@ bool MTLShader::finalize(const shader::ShaderCreateInfo *info) valid_ = true; /* Prepare backing data storage for local uniforms. */ - const MTLShaderUniformBlock &push_constant_block = mtl_interface->get_push_constant_block(); + const MTLShaderBufferBlock &push_constant_block = mtl_interface->get_push_constant_block(); if (push_constant_block.size > 0) { push_constant_data_ = MEM_callocN(push_constant_block.size, __func__); this->push_constant_bindstate_mark_dirty(true); @@ -987,12 +987,26 @@ MTLRenderPipelineStateInstance *MTLShader::bake_pipeline_state( type:MTLDataTypeInt withName:@"MTL_uniform_buffer_base_index"]; + /* Storage buffer bind index. + * This is always relative to MTL_uniform_buffer_base_index, plus the number of active buffers, + * and an additional space for the push constant block. + * If the shader does not have any uniform blocks, then we can place directly after the push + * constant block. As we do not need an extra spot for the UBO at index '0'. */ + int MTL_storage_buffer_base_index = MTL_uniform_buffer_base_index + + ((mtl_interface->get_total_uniform_blocks() > 0) ? + (mtl_interface->get_max_ubo_index() + 2) : + (MTL_uniform_buffer_base_index + 1)); + [values setConstantValue:&MTL_storage_buffer_base_index + type:MTLDataTypeInt + withName:@"MTL_storage_buffer_base_index"]; + /* Transform feedback constant. - * Ensure buffer is placed after existing buffers, including default buffers. */ + * Ensure buffer is placed after existing buffers, including default buffers, UBOs and SSBOs. + */ int MTL_transform_feedback_buffer_index = (this->transform_feedback_type_ != GPU_SHADER_TFB_NONE) ? - MTL_uniform_buffer_base_index + - mtl_interface->get_max_ubo_index() + 2 : + MTL_storage_buffer_base_index + + mtl_interface->get_max_ssbo_index() + 2 : -1; if (this->transform_feedback_type_ != GPU_SHADER_TFB_NONE) { @@ -1150,6 +1164,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_pipeline_state( pso_inst->frag = desc.fragmentFunction; pso_inst->pso = pso; pso_inst->base_uniform_buffer_index = MTL_uniform_buffer_base_index; + pso_inst->base_storage_buffer_index = MTL_storage_buffer_base_index; pso_inst->null_attribute_buffer_index = (using_null_buffer) ? null_buffer_index : -1; pso_inst->transform_feedback_buffer_index = MTL_transform_feedback_buffer_index; pso_inst->prim_type = prim_type; @@ -1254,6 +1269,8 @@ bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx) { /* NOTE(Metal): Bakes and caches a PSO for compute. */ BLI_assert(this); + MTLShaderInterface *mtl_interface = this->get_interface(); + BLI_assert(mtl_interface); BLI_assert(this->is_valid()); BLI_assert(shader_library_compute_ != nil); @@ -1275,7 +1292,19 @@ bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx) type:MTLDataTypeInt withName:@"MTL_uniform_buffer_base_index"]; - /* TODO: SSBO binding base index. */ + /* Storage buffer bind index. + * This is always relative to MTL_uniform_buffer_base_index, plus the number of active buffers, + * and an additional space for the push constant block. + * If the shader does not have any uniform blocks, then we can place directly after the push + * constant block. 
As we do not need an extra spot for the UBO at index '0'. */
+    int MTL_storage_buffer_base_index = MTL_uniform_buffer_base_index +
+                                        ((mtl_interface->get_total_uniform_blocks() > 0) ?
+                                             (mtl_interface->get_max_ubo_index() + 2) :
+                                             (MTL_uniform_buffer_base_index + 1));
+
+    [values setConstantValue:&MTL_storage_buffer_base_index
+                        type:MTLDataTypeInt
+                    withName:@"MTL_storage_buffer_base_index"];
 
     /* Compile compute function. */
     NSError *error = nullptr;
@@ -1321,8 +1350,7 @@ bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx)
     compute_pso_instance_.compute = [compute_function retain];
     compute_pso_instance_.pso = [pso retain];
     compute_pso_instance_.base_uniform_buffer_index = MTL_uniform_buffer_base_index;
-    /* TODO: Add SSBO base buffer index support. */
-    compute_pso_instance_.base_ssbo_buffer_index = -1;
+    compute_pso_instance_.base_storage_buffer_index = MTL_storage_buffer_base_index;
   }
   return true;
 }
diff --git a/source/blender/gpu/metal/mtl_shader_generator.hh b/source/blender/gpu/metal/mtl_shader_generator.hh
index 84a921a852b..3015f5645fa 100644
--- a/source/blender/gpu/metal/mtl_shader_generator.hh
+++ b/source/blender/gpu/metal/mtl_shader_generator.hh
@@ -105,6 +105,22 @@
  * }
  * \endcode
  *
+ * -- Metal buffer bindings structure --
+ *
+ * A Metal shader contains several different binding types. All buffers are bound using the
+ * buffer(N) binding attribute tag. However, different ranges serve different purposes. The
+ * structure of the bindings is always as follows:
+ *
+ * Vertex Buffers (N)                        <-- 0
+ * Index buffer
+ * Default Push constant block for uniforms  <-- MTL_uniform_buffer_base_index
+ * Uniform buffers                           <-- MTL_uniform_buffer_base_index+1
+ * Storage buffers                           <-- MTL_storage_buffer_base_index
+ * Samplers/argument buffer table            <-- last buffer + 1
+ * Transform feedback buffer                 <-- last_buffer + 2
+ *
+ * Up to a maximum of 31 bindings.
+ *
 * -- SSBO-vertex-fetchmode --
 *
 * SSBO-vertex-fetchmode is a special option wherein vertex buffers are bound directly
@@ -200,13 +216,14 @@ struct MSLUniform {
   }
 };
 
-struct MSLUniformBlock {
+struct MSLBufferBlock {
   std::string type_name;
   std::string name;
   ShaderStage stage;
   bool is_array;
+  shader::Qualifier qualifiers;
 
-  bool operator==(const MSLUniformBlock &right) const
+  bool operator==(const MSLBufferBlock &right) const
   {
     return (type_name == right.type_name && name == right.name);
   }
@@ -369,7 +386,8 @@ class MSLGeneratorInterface {
 public:
  /** Shader stage input/output binding information.
   * Derived from shader source reflection or GPUShaderCreateInfo. */
-  blender::Vector<MSLUniformBlock> uniform_blocks;
+  blender::Vector<MSLBufferBlock> uniform_blocks;
+  blender::Vector<MSLBufferBlock> storage_blocks;
   blender::Vector<MSLUniform> uniforms;
   blender::Vector<MSLTextureSampler> texture_samplers;
   blender::Vector<MSLVertexInputAttribute> vertex_input_attributes;
@@ -385,7 +403,8 @@ class MSLGeneratorInterface {
   blender::Vector clip_distances;
   /* Shared Memory Blocks. */
   blender::Vector<MSLSharedMemoryBlock> shared_memory_blocks;
-
+  /* Max bind IDs. */
+  int max_tex_bind_index = 0;
   /** GL Global usage. */
   /* Whether GL position is used, or an alternative vertex output should be the default. */
   bool uses_gl_Position;
@@ -459,8 +478,10 @@ class MSLGeneratorInterface {
   /* Samplers. */
   bool use_argument_buffer_for_samplers() const;
   uint32_t num_samplers_for_stage(ShaderStage stage) const;
+  uint32_t max_sampler_index_for_stage(ShaderStage stage) const;
 
-  /* Returns the bind index, relative to MTL_uniform_buffer_base_index. */
+  /* Returns the bind index, relative to
+   * MTL_uniform_buffer_base_index+MTL_storage_buffer_base_index. */
  uint32_t get_sampler_argument_buffer_bind_index(ShaderStage stage);
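 
  /* [Editorial sketch, not part of the patch.] A concrete instance of the binding
   * table documented above, for a hypothetical shader with 2 vertex buffers,
   * 3 uniform blocks and 2 storage blocks:
   *   buffer(0..1) vertex buffers
   *   buffer(2)    push constant block       <-- MTL_uniform_buffer_base_index = 2
   *   buffer(3..5) uniform buffers
   *   buffer(6..7) storage buffers           <-- MTL_storage_buffer_base_index = 2 + (2 + 2) = 6
   *   buffer(8)    samplers argument buffer  <-- 2 + (3 + 2 + 1), per the function above.
   *   buffer(9)    transform feedback buffer
   */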
 
  /* Code generation utility functions. */
  std::string generate_msl_vertex_entry_stub();
  std::string generate_msl_fragment_entry_stub();
  std::string generate_msl_compute_entry_stub();
  std::string generate_msl_global_uniform_population(ShaderStage stage);
-  std::string generate_ubo_block_macro_chain(MSLUniformBlock block);
+  std::string generate_ubo_block_macro_chain(MSLBufferBlock block);
  std::string generate_msl_uniform_block_population(ShaderStage stage);
  std::string generate_msl_vertex_attribute_input_population();
  std::string generate_msl_vertex_output_population();
@@ -538,7 +559,9 @@ inline bool is_builtin_type(std::string type)
 {
   /* Add Types as needed. */
   /* TODO(Metal): Consider replacing this with a switch and `constexpr` hash and switch.
-   * Though most efficient and maintainable approach to be determined. */
+   * Though most efficient and maintainable approach to be determined.
+   * NOTE: Some duplicate types exist for Metal and GLSL representations, as generated typenames
+   * from createinfo may use GLSL signature. */
   static std::map<std::string, eMTLDataType> glsl_builtin_types = {
       {"float", MTL_DATATYPE_FLOAT},
       {"vec2", MTL_DATATYPE_FLOAT2},
@@ -548,10 +571,17 @@ inline bool is_builtin_type(std::string type)
       {"ivec2", MTL_DATATYPE_INT2},
       {"ivec3", MTL_DATATYPE_INT3},
       {"ivec4", MTL_DATATYPE_INT4},
+      {"int2", MTL_DATATYPE_INT2},
+      {"int3", MTL_DATATYPE_INT3},
+      {"int4", MTL_DATATYPE_INT4},
       {"uint32_t", MTL_DATATYPE_UINT},
       {"uvec2", MTL_DATATYPE_UINT2},
       {"uvec3", MTL_DATATYPE_UINT3},
       {"uvec4", MTL_DATATYPE_UINT4},
+      {"uint", MTL_DATATYPE_UINT},
+      {"uint2", MTL_DATATYPE_UINT2},
+      {"uint3", MTL_DATATYPE_UINT3},
+      {"uint4", MTL_DATATYPE_UINT4},
       {"mat3", MTL_DATATYPE_FLOAT3x3},
       {"mat4", MTL_DATATYPE_FLOAT4x4},
       {"bool", MTL_DATATYPE_INT},
diff --git a/source/blender/gpu/metal/mtl_shader_generator.mm b/source/blender/gpu/metal/mtl_shader_generator.mm
index 0b866456ffc..db9f176fe83 100644
--- a/source/blender/gpu/metal/mtl_shader_generator.mm
+++ b/source/blender/gpu/metal/mtl_shader_generator.mm
@@ -709,8 +709,30 @@ static void print_resource(std::ostream &os, const ShaderCreateInfo::Resource &r
       }
       break;
     }
-    case ShaderCreateInfo::Resource::BindType::STORAGE_BUFFER:
+    case ShaderCreateInfo::Resource::BindType::STORAGE_BUFFER: {
+      int64_t array_offset = res.storagebuf.name.find_first_of("[");
+      bool writeable = (res.storagebuf.qualifiers & shader::Qualifier::WRITE) ==
+                       shader::Qualifier::WRITE;
+      const char *memory_scope = ((writeable) ? "device " : "constant ");
+      if (array_offset == -1) {
+        /* Create local class member as device pointer reference to bound SSBO.
+         * Given usage within a shader follows ssbo_name.ssbo_element syntax, we can
+         * dereference the pointer as the compiler will optimize this data fetch.
+         * To do this, we also give the SSBO name a post-fix of `_local` to avoid
+         * macro accessor collisions. */
+
+        os << memory_scope << res.storagebuf.type_name << " *" << res.storagebuf.name
+           << "_local;\n";
+        os << "#define " << res.storagebuf.name << " (*" << res.storagebuf.name << "_local)\n";
+      }
+      else {
+        /* For arrays, we can directly provide the constant access pointer, as the array
+         * syntax will de-reference this at the correct fetch index.
*/ + StringRef name_no_array = StringRef(res.storagebuf.name.c_str(), array_offset); + os << memory_scope << res.storagebuf.type_name << " *" << name_no_array << ";\n"; + } break; + } } } @@ -999,7 +1021,7 @@ bool MTLShader::generate_msl_from_glsl(const shader::ShaderCreateInfo *info) if (msl_iface.use_argument_buffer_for_samplers()) { ss_vertex << "#define USE_ARGUMENT_BUFFER_FOR_SAMPLERS 1" << std::endl; ss_vertex << "#define ARGUMENT_BUFFER_NUM_SAMPLERS " - << msl_iface.num_samplers_for_stage(ShaderStage::VERTEX) << std::endl; + << msl_iface.max_sampler_index_for_stage(ShaderStage::VERTEX) + 1 << std::endl; } if (msl_iface.uses_ssbo_vertex_fetch_mode) { ss_vertex << "#define MTL_SSBO_VERTEX_FETCH 1" << std::endl; @@ -1190,7 +1212,7 @@ bool MTLShader::generate_msl_from_glsl(const shader::ShaderCreateInfo *info) if (msl_iface.use_argument_buffer_for_samplers()) { ss_fragment << "#define USE_ARGUMENT_BUFFER_FOR_SAMPLERS 1" << std::endl; ss_fragment << "#define ARGUMENT_BUFFER_NUM_SAMPLERS " - << msl_iface.num_samplers_for_stage(ShaderStage::FRAGMENT) << std::endl; + << msl_iface.max_sampler_index_for_stage(ShaderStage::FRAGMENT) + 1 << std::endl; } /* Inject common Metal header. */ @@ -1437,7 +1459,7 @@ bool MTLShader::generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *i if (msl_iface.use_argument_buffer_for_samplers()) { ss_compute << "#define USE_ARGUMENT_BUFFER_FOR_SAMPLERS 1" << std::endl; ss_compute << "#define ARGUMENT_BUFFER_NUM_SAMPLERS " - << msl_iface.num_samplers_for_stage(ShaderStage::COMPUTE) << std::endl; + << msl_iface.max_sampler_index_for_stage(ShaderStage::COMPUTE) + 1 << std::endl; } /* Inject static workgroup sizes. */ @@ -1555,6 +1577,31 @@ bool MTLShader::generate_msl_from_glsl_compute(const shader::ShaderCreateInfo *i this->set_compute_function_name(@"compute_function_entry"); #endif + /* DEBUG: Export source to file for manual verification. 
 */
+#if MTL_SHADER_DEBUG_EXPORT_SOURCE
+  NSFileManager *sharedFM = [NSFileManager defaultManager];
+  NSURL *app_bundle_url = [[NSBundle mainBundle] bundleURL];
+  NSURL *shader_dir = [[app_bundle_url URLByDeletingLastPathComponent]
+      URLByAppendingPathComponent:@"Shaders/"
+                      isDirectory:YES];
+  [sharedFM createDirectoryAtURL:shader_dir
+      withIntermediateDirectories:YES
+                       attributes:nil
+                            error:nil];
+  const char *path_cstr = [shader_dir fileSystemRepresentation];
+
+  std::ofstream compute_fs;
+  compute_fs.open(
+      (std::string(path_cstr) + "/" + std::string(this->name) + "_GeneratedComputeShader.msl")
+          .c_str());
+  compute_fs << ss_compute.str();
+  compute_fs.close();
+
+  shader_debug_printf(
+      "Compute Shader Saved to: %s\n",
+      (std::string(path_cstr) + std::string(this->name) + "_GeneratedComputeShader.msl").c_str());
+#endif
+
   NSString *msl_final_compute = [NSString stringWithUTF8String:ss_compute.str().c_str()];
   this->shader_compute_source_from_msl(msl_final_compute);
 
@@ -1738,6 +1785,7 @@ void MSLGeneratorInterface::prepare_from_createinfo(const shader::ShaderCreateIn
         MSLTextureSampler msl_tex(
             ShaderStage::ANY, res.sampler.type, res.sampler.name, access, used_slot);
         texture_samplers.append(msl_tex);
+        max_tex_bind_index = max_ii(used_slot, max_tex_bind_index);
       } break;
 
       case shader::ShaderCreateInfo::Resource::BindType::IMAGE: {
@@ -1771,14 +1819,16 @@ void MSLGeneratorInterface::prepare_from_createinfo(const shader::ShaderCreateIn
                                   access,
                                   used_slot);
         texture_samplers.append(msl_tex);
+        max_tex_bind_index = max_ii(used_slot, max_tex_bind_index);
       } break;
 
      case shader::ShaderCreateInfo::Resource::BindType::UNIFORM_BUFFER: {
-        MSLUniformBlock ubo;
+        MSLBufferBlock ubo;
         BLI_assert(res.uniformbuf.type_name.size() > 0);
         BLI_assert(res.uniformbuf.name.size() > 0);
         int64_t array_offset = res.uniformbuf.name.find_first_of("[");
 
+        ubo.qualifiers = shader::Qualifier::READ;
         ubo.type_name = res.uniformbuf.type_name;
         ubo.is_array = (array_offset > -1);
         if (ubo.is_array) {
@@ -1794,8 +1844,24 @@ void MSLGeneratorInterface::prepare_from_createinfo(const shader::ShaderCreateIn
       } break;
 
       case shader::ShaderCreateInfo::Resource::BindType::STORAGE_BUFFER: {
-        /* TODO(Metal): Support shader storage buffer in Metal.
-         * Pending compute support. */
+        MSLBufferBlock ssbo;
+        BLI_assert(res.storagebuf.type_name.size() > 0);
+        BLI_assert(res.storagebuf.name.size() > 0);
+        int64_t array_offset = res.storagebuf.name.find_first_of("[");
+
+        ssbo.qualifiers = res.storagebuf.qualifiers;
+        ssbo.type_name = res.storagebuf.type_name;
+        ssbo.is_array = (array_offset > -1);
+        if (ssbo.is_array) {
+          /* If array SSBO, strip out array tag from name. */
+          StringRef name_no_array = StringRef(res.storagebuf.name.c_str(), array_offset);
+          ssbo.name = name_no_array;
+        }
+        else {
+          ssbo.name = res.storagebuf.name;
+        }
+        ssbo.stage = ShaderStage::FRAGMENT | ShaderStage::COMPUTE;
+        storage_blocks.append(ssbo);
       } break;
     }
   }
@@ -1850,10 +1916,28 @@ void MSLGeneratorInterface::prepare_from_createinfo(const shader::ShaderCreateIn
 
 bool MSLGeneratorInterface::use_argument_buffer_for_samplers() const
 {
-  /* We can only use argument buffers IF sampler count exceeds static limit of 16,
-   * AND we can support more samplers with an argument buffer.
-   * NOTE: We reserve one constant sampler within the shader for fast read via point-sampling.
*/ - return texture_samplers.size() >= 15 && GPU_max_samplers() > 16; + /* We can only use argument buffers IF highest sampler index exceeds static limit of 16, + * AND we can support more samplers with an argument buffer. */ + bool use_argument_buffer = (texture_samplers.size() >= 15 || max_tex_bind_index >= 14) && + GPU_max_samplers() > 15; + +#ifndef NDEBUG + /* Due to explicit bind location support, we may be below the sampler limit, but forced to offset + * bindings due to the range being high. Introduce debug check here to issue warning. In these + * cases, if explicit bind location support is not required, best to use auto_resource_location + * to optimize bind point packing. */ + if (use_argument_buffer && texture_samplers.size() < 15) { + MTL_LOG_WARNING( + "Compiled Shader '%s' is falling back to bindless via argument buffers due to having a " + "texture sampler of Index: %u Which exceeds the limit of 15+1. However shader only uses " + "%d textures. Consider optimising bind points with .auto_resource_location(true).\n", + parent_shader_.name_get(), + max_tex_bind_index, + (int)texture_samplers.size()); + } +#endif + + return use_argument_buffer; } uint32_t MSLGeneratorInterface::num_samplers_for_stage(ShaderStage stage) const @@ -1863,6 +1947,13 @@ uint32_t MSLGeneratorInterface::num_samplers_for_stage(ShaderStage stage) const return texture_samplers.size(); } +uint32_t MSLGeneratorInterface::max_sampler_index_for_stage(ShaderStage stage) const +{ + /* NOTE: Sampler bindings and argument buffer shared across stages, + * in case stages share texture/sampler bindings. */ + return max_tex_bind_index; +} + uint32_t MSLGeneratorInterface::get_sampler_argument_buffer_bind_index(ShaderStage stage) { /* Note: Shader stage must be a singular index. Compound shader masks are not valid for this @@ -1873,7 +1964,7 @@ uint32_t MSLGeneratorInterface::get_sampler_argument_buffer_bind_index(ShaderSta return sampler_argument_buffer_bind_index[get_shader_stage_index(stage)]; } sampler_argument_buffer_bind_index[get_shader_stage_index(stage)] = - (this->uniform_blocks.size() + 1); + (this->uniform_blocks.size() + this->storage_blocks.size() + 1); return sampler_argument_buffer_bind_index[get_shader_stage_index(stage)]; } @@ -2148,7 +2239,6 @@ std::string MSLGeneratorInterface::generate_msl_compute_entry_stub() out << this->generate_msl_texture_vars(ShaderStage::COMPUTE); out << this->generate_msl_global_uniform_population(ShaderStage::COMPUTE); out << this->generate_msl_uniform_block_population(ShaderStage::COMPUTE); - /* TODO(Metal): SSBO Population. */ /* Execute original 'main' function within class scope. */ out << "\t/* Execute Compute main function */\t" << std::endl @@ -2205,8 +2295,9 @@ void MSLGeneratorInterface::generate_msl_textures_input_string(std::stringstream void MSLGeneratorInterface::generate_msl_uniforms_input_string(std::stringstream &out, ShaderStage stage) { + /* Uniform buffers. */ int ubo_index = 0; - for (const MSLUniformBlock &ubo : this->uniform_blocks) { + for (const MSLBufferBlock &ubo : this->uniform_blocks) { if (bool(ubo.stage & stage)) { /* For literal/existing global types, we do not need the class name-space accessor. */ out << ",\n\tconstant "; @@ -2222,6 +2313,28 @@ void MSLGeneratorInterface::generate_msl_uniforms_input_string(std::stringstream } ubo_index++; } + + /* Storage buffers. 
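+   * Editorial example (hypothetical names, not from the patch): for a writeable
+   * block, the loop below emits an entry-point parameter of the form
+   *   device CullingData* light_cull_buf[[buffer(MTL_storage_buffer_base_index+0)]]
+   * while read-only blocks are bound in the `constant` address space instead.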
 */
+  int ssbo_index = 0;
+  for (const MSLBufferBlock &ssbo : this->storage_blocks) {
+    if (bool(ssbo.stage & stage)) {
+      /* For literal/existing global types, we do not need the class name-space accessor. */
+      bool writeable = (ssbo.qualifiers & shader::Qualifier::WRITE) == shader::Qualifier::WRITE;
+      const char *memory_scope = ((writeable) ? "device " : "constant ");
+      out << ",\n\t" << memory_scope;
+      if (!is_builtin_type(ssbo.type_name)) {
+        out << get_stage_class_name(stage) << "::";
+      }
+      /* #StorageBuffer bind indices start at `MTL_storage_buffer_base_index`, which follows
+       * immediately after all uniform blocks, such that MTL_storage_buffer_base_index =
+       * MTL_uniform_buffer_base_index + uniform_blocks.size() + 1, where the additional
+       * buffer is reserved for the #PushConstantBlock (push constants). */
+      out << ssbo.type_name << "* " << ssbo.name << "[[buffer(MTL_storage_buffer_base_index+"
+          << (ssbo_index) << ")]]";
+    }
+    ssbo_index++;
+  }
 }
 
 std::string MSLGeneratorInterface::generate_msl_vertex_inputs_string()
 {
@@ -2372,9 +2485,13 @@ std::string MSLGeneratorInterface::generate_msl_uniform_undefs(ShaderStage shade
     out << "#undef " << uniform.name << std::endl;
   }
   /* UBO block undef. */
-  for (const MSLUniformBlock &ubo : this->uniform_blocks) {
+  for (const MSLBufferBlock &ubo : this->uniform_blocks) {
     out << "#undef " << ubo.name << std::endl;
   }
+  /* SSBO block undef. */
+  for (const MSLBufferBlock &ssbo : this->storage_blocks) {
+    out << "#undef " << ssbo.name << std::endl;
+  }
   return out.str();
 }
 
@@ -2656,7 +2773,7 @@ std::string MSLGeneratorInterface::generate_msl_uniform_block_population(ShaderS
   /* Populate Global Uniforms. */
   std::stringstream out;
   out << "\t/* Copy UBO block references into local class variables */" << std::endl;
-  for (const MSLUniformBlock &ubo : this->uniform_blocks) {
+  for (const MSLBufferBlock &ubo : this->uniform_blocks) {
 
     /* Only include blocks which are used within this stage. */
     if (bool(ubo.stage & stage)) {
@@ -2672,6 +2789,26 @@ std::string MSLGeneratorInterface::generate_msl_uniform_block_population(ShaderS
       out << " = " << ubo.name << ";" << std::endl;
     }
   }
+
+  /* Populate storage buffer references. */
+  out << "\t/* Copy SSBO block references into local class variables */" << std::endl;
+  for (const MSLBufferBlock &ssbo : this->storage_blocks) {
+
+    /* Only include blocks which are used within this stage. */
+    if (bool(ssbo.stage & stage)) {
+      /* Generate SSBO reference assignment.
+       * NOTE(Metal): We append `_local` post-fix onto the class member name
+       * for the SSBO to avoid name collision with the SSBO accessor macro.
+       * We only need to add this post-fix for the non-array access variant,
+       * as the array is indexed directly, rather than requiring a dereference. */
+      out << "\t" << get_shader_stage_instance_name(stage) << "." << ssbo.name;
+      if (!ssbo.is_array) {
+        out << "_local";
+      }
+      out << " = " << ssbo.name << ";" << std::endl;
+    }
+  }
+
   out << std::endl;
   return out.str();
 }
@@ -3261,6 +3398,18 @@ MTLShaderInterface *MSLGeneratorInterface::bake_shader_interface(const char *nam
                                this->uniform_blocks[uniform_block].stage);
   }
 
+  /* Prepare Interface Storage Blocks.
*/ + for (int storage_block = 0; storage_block < this->storage_blocks.size(); storage_block++) { + interface->add_storage_block( + name_buffer_copystr(&interface->name_buffer_, + this->storage_blocks[storage_block].name.c_str(), + name_buffer_size, + name_buffer_offset), + storage_block, + 0, + this->storage_blocks[storage_block].stage); + } + /* Texture/sampler bindings to interface. */ for (const MSLTextureSampler &texture_sampler : this->texture_samplers) { interface->add_texture(name_buffer_copystr(&interface->name_buffer_, diff --git a/source/blender/gpu/metal/mtl_shader_interface.hh b/source/blender/gpu/metal/mtl_shader_interface.hh index da9489685b8..b95a3dcc3f4 100644 --- a/source/blender/gpu/metal/mtl_shader_interface.hh +++ b/source/blender/gpu/metal/mtl_shader_interface.hh @@ -107,7 +107,7 @@ struct MTLShaderInputAttribute { uint32_t matrix_element_count; }; -struct MTLShaderUniformBlock { +struct MTLShaderBufferBlock { uint32_t name_offset; uint32_t size = 0; /* Buffer resource bind index in shader `[[buffer(index)]]`. */ @@ -120,7 +120,7 @@ struct MTLShaderUniformBlock { struct MTLShaderUniform { uint32_t name_offset; - /* Index of `MTLShaderUniformBlock` this uniform belongs to. */ + /* Index of `MTLShaderBufferBlock` this uniform belongs to. */ uint32_t size_in_bytes; uint32_t byte_offset; eMTLDataType type; @@ -173,8 +173,13 @@ class MTLShaderInterface : public ShaderInterface { /* Uniform Blocks. */ uint32_t total_uniform_blocks_; uint32_t max_uniformbuf_index_; - MTLShaderUniformBlock ubos_[MTL_MAX_UNIFORM_BUFFER_BINDINGS]; - MTLShaderUniformBlock push_constant_block_; + MTLShaderBufferBlock ubos_[MTL_MAX_UNIFORM_BUFFER_BINDINGS]; + MTLShaderBufferBlock push_constant_block_; + + /* Storage blocks. */ + uint32_t total_storage_blocks_; + uint32_t max_storagebuf_index_; + MTLShaderBufferBlock ssbos_[MTL_MAX_STORAGE_BUFFER_BINDINGS]; /* Textures. */ /* Textures support explicit binding indices, so some texture slots @@ -209,6 +214,10 @@ class MTLShaderInterface : public ShaderInterface { uint32_t buffer_index, uint32_t size, ShaderStage stage_mask = ShaderStage::ANY); + uint32_t add_storage_block(uint32_t name_offset, + uint32_t buffer_index, + uint32_t size, + ShaderStage stage_mask = ShaderStage::ANY); void add_uniform(uint32_t name_offset, eMTLDataType type, int array_len = 1); void add_texture(uint32_t name_offset, uint32_t texture_slot, @@ -232,14 +241,21 @@ class MTLShaderInterface : public ShaderInterface { uint32_t get_total_uniforms() const; /* Fetch Uniform Blocks. */ - const MTLShaderUniformBlock &get_uniform_block(uint index) const; + const MTLShaderBufferBlock &get_uniform_block(uint index) const; uint32_t get_total_uniform_blocks() const; uint32_t get_max_ubo_index() const; bool has_uniform_block(uint32_t block_index) const; uint32_t get_uniform_block_size(uint32_t block_index) const; + /* Fetch Storage Blocks. */ + const MTLShaderBufferBlock &get_storage_block(uint index) const; + uint32_t get_total_storage_blocks() const; + uint32_t get_max_ssbo_index() const; + bool has_storage_block(uint32_t block_index) const; + uint32_t get_storage_block_size(uint32_t block_index) const; + /* Push constant uniform data block should always be available. */ - const MTLShaderUniformBlock &get_push_constant_block() const; + const MTLShaderBufferBlock &get_push_constant_block() const; /* Fetch textures. 
 */
  const MTLShaderTexture &get_texture(uint index) const;
diff --git a/source/blender/gpu/metal/mtl_shader_interface.mm b/source/blender/gpu/metal/mtl_shader_interface.mm
index 5a66e541f3b..97f85046a3d 100644
--- a/source/blender/gpu/metal/mtl_shader_interface.mm
+++ b/source/blender/gpu/metal/mtl_shader_interface.mm
@@ -56,6 +56,8 @@ void MTLShaderInterface::init()
   total_attributes_ = 0;
   total_uniform_blocks_ = 0;
   max_uniformbuf_index_ = 0;
+  total_storage_blocks_ = 0;
+  max_storagebuf_index_ = 0;
   total_uniforms_ = 0;
   total_textures_ = 0;
   max_texture_index_ = -1;
@@ -73,6 +75,9 @@ void MTLShaderInterface::init()
   for (const int ubo : IndexRange(GPU_NUM_UNIFORM_BLOCKS)) {
     builtin_blocks_[ubo] = -1;
   }
+  for (const int ssbo : IndexRange(GPU_NUM_STORAGE_BUFFERS)) {
+    builtin_buffers_[ssbo] = -1;
+  }
   for (const int tex : IndexRange(MTL_MAX_TEXTURE_SLOTS)) {
     textures_[tex].used = false;
     textures_[tex].slot_index = -1;
@@ -117,7 +122,10 @@ uint32_t MTLShaderInterface::add_uniform_block(uint32_t name_offset,
     size += 16 - (size % 16);
   }
 
-  MTLShaderUniformBlock &uni_block = ubos_[total_uniform_blocks_];
+  BLI_assert(total_uniform_blocks_ < MTL_MAX_UNIFORM_BUFFER_BINDINGS);
+  BLI_assert(buffer_index < MTL_MAX_STORAGE_BUFFER_BINDINGS);
+
+  MTLShaderBufferBlock &uni_block = ubos_[total_uniform_blocks_];
   uni_block.name_offset = name_offset;
   uni_block.buffer_index = buffer_index;
   uni_block.size = size;
@@ -127,6 +135,29 @@ uint32_t MTLShaderInterface::add_uniform_block(uint32_t name_offset,
   return (total_uniform_blocks_++);
 }
 
+uint32_t MTLShaderInterface::add_storage_block(uint32_t name_offset,
+                                               uint32_t buffer_index,
+                                               uint32_t size,
+                                               ShaderStage stage_mask)
+{
+  /* Ensure size is 16-byte aligned to guarantee alignment rules are satisfied. */
+  if ((size % 16) != 0) {
+    size += 16 - (size % 16);
+  }
+
+  BLI_assert(total_storage_blocks_ < MTL_MAX_STORAGE_BUFFER_BINDINGS);
+  BLI_assert(buffer_index < MTL_MAX_STORAGE_BUFFER_BINDINGS);
+
+  MTLShaderBufferBlock &ssbo_block = ssbos_[total_storage_blocks_];
+  ssbo_block.name_offset = name_offset;
+  ssbo_block.buffer_index = buffer_index;
+  ssbo_block.size = size;
+  ssbo_block.current_offset = 0;
+  ssbo_block.stage_mask = ShaderStage::ANY;
+  max_storagebuf_index_ = max_ii(max_storagebuf_index_, buffer_index);
+  return (total_storage_blocks_++);
+}
+
 void MTLShaderInterface::add_push_constant_block(uint32_t name_offset)
 {
   push_constant_block_.name_offset = name_offset;
@@ -227,6 +258,9 @@ void MTLShaderInterface::map_builtins()
   for (const int ubo : IndexRange(GPU_NUM_UNIFORM_BLOCKS)) {
     builtin_blocks_[ubo] = -1;
   }
+  for (const int ssbo : IndexRange(GPU_NUM_STORAGE_BUFFERS)) {
+    builtin_buffers_[ssbo] = -1;
+  }
 
   /* Resolve and cache uniform locations for builtin uniforms. */
   for (const int u : IndexRange(GPU_NUM_UNIFORMS)) {
@@ -257,6 +291,22 @@ void MTLShaderInterface::map_builtins()
       }
     }
   }
+
+  /* Resolve and cache uniform locations for builtin storage buffers. */
+  for (const int u : IndexRange(GPU_NUM_STORAGE_BUFFERS)) {
+    const ShaderInput *uni = this->ssbo_get(
+        builtin_storage_block_name((GPUStorageBufferBuiltin)u));
+
+    if (uni != nullptr) {
+      BLI_assert(uni->location >= 0);
+      if (uni->location >= 0) {
+        builtin_buffers_[u] = uni->binding;
+        MTL_LOG_INFO("Mapped builtin storage buffer '%s' to location %d\n",
+                     builtin_storage_block_name((GPUStorageBufferBuiltin)u),
+                     uni->location);
+      }
+    }
+  }
 }
 
 /* Populate #ShaderInput struct based on interface.
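 * Editorial note: the flattened `inputs_` array is laid out as
 * [attributes | uniform blocks | uniforms + textures | storage blocks], so the
 * SSBO inputs appended below begin at offset `attr_len_ + ubo_len_ + uniform_len_`.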
*/
@@ -272,9 +322,7 @@ void MTLShaderInterface::prepare_common_shader_inputs()
   attr_len_ = this->get_total_attributes();
   ubo_len_ = this->get_total_uniform_blocks();
   uniform_len_ = this->get_total_uniforms() + this->get_total_textures();
-
-  /* TODO(Metal): Support storage buffer bindings. Pending compute shader support. */
-  ssbo_len_ = 0;
+  ssbo_len_ = this->get_total_storage_blocks();

   /* Calculate total inputs and allocate #ShaderInput array. */
   /* NOTE: We use the existing `name_buffer_` allocated for internal input structs. */
@@ -300,7 +348,7 @@ void MTLShaderInterface::prepare_common_shader_inputs()
   BLI_assert(&inputs_[attr_len_] >= current_input);
   current_input = &inputs_[attr_len_];
   for (const int ubo_index : IndexRange(total_uniform_blocks_)) {
-    MTLShaderUniformBlock &shd_ubo = ubos_[ubo_index];
+    MTLShaderBufferBlock &shd_ubo = ubos_[ubo_index];
     current_input->name_offset = shd_ubo.name_offset;
     current_input->name_hash = BLI_hash_string(this->get_name_at_offset(shd_ubo.name_offset));
     /* Location refers to the index in the ubos_ array. */
@@ -308,7 +356,8 @@ void MTLShaderInterface::prepare_common_shader_inputs()
     /* Binding location refers to the UBO bind slot in
      * #MTLContextGlobalShaderPipelineState::ubo_bindings. The buffer bind index [[buffer(N)]]
      * within the shader will apply an offset for bound vertex buffers and the default uniform
-     * PushConstantBlock. */
+     * PushConstantBlock.
+     * See `mtl_shader_generator.hh` for the buffer binding table breakdown. */
     current_input->binding = shd_ubo.buffer_index;
     current_input++;
   }
@@ -357,10 +406,24 @@ void MTLShaderInterface::prepare_common_shader_inputs()
     }
   }

-  /* SSBO bindings.
-   * TODO(Metal): Support SSBOs. Pending compute support. */
+  /* SSBO bindings. */
   BLI_assert(&inputs_[attr_len_ + ubo_len_ + uniform_len_] >= current_input);
   current_input = &inputs_[attr_len_ + ubo_len_ + uniform_len_];
+  BLI_assert(ssbo_len_ >= total_storage_blocks_);
+  for (const int ssbo_index : IndexRange(total_storage_blocks_)) {
+    MTLShaderBufferBlock &shd_ssbo = ssbos_[ssbo_index];
+    current_input->name_offset = shd_ssbo.name_offset;
+    current_input->name_hash = BLI_hash_string(this->get_name_at_offset(shd_ssbo.name_offset));
+    /* Location refers to the index in the ssbos_ array. */
+    current_input->location = ssbo_index;
+    /* Binding location refers to the SSBO bind slot in
+     * #MTLContextGlobalShaderPipelineState::ssbo_bindings. The buffer bind index [[buffer(N)]]
+     * within the shader will apply an offset for bound vertex buffers and the default uniform
+     * PushConstantBlock, after other uniform blocks.
+     * See `mtl_shader_generator.hh` for the buffer binding table breakdown. */
+    current_input->binding = shd_ssbo.buffer_index;
+    current_input++;
+  }

   /* Map builtin uniform indices to uniform binding locations. */
   this->map_builtins();
@@ -417,14 +480,14 @@ uint32_t MTLShaderInterface::get_total_uniforms() const
 }

 /* Uniform Blocks. 
*/
-const MTLShaderUniformBlock &MTLShaderInterface::get_uniform_block(uint index) const
+const MTLShaderBufferBlock &MTLShaderInterface::get_uniform_block(uint index) const
 {
   BLI_assert(index < MTL_MAX_UNIFORM_BUFFER_BINDINGS);
   BLI_assert(index < get_total_uniform_blocks());
   return ubos_[index];
 }

-const MTLShaderUniformBlock &MTLShaderInterface::get_push_constant_block() const
+const MTLShaderBufferBlock &MTLShaderInterface::get_push_constant_block() const
 {
   return push_constant_block_;
 }
@@ -449,6 +512,33 @@ uint32_t MTLShaderInterface::get_uniform_block_size(uint32_t block_index) const
   return (block_index < total_uniform_blocks_) ? ubos_[block_index].size : 0;
 }

+/* Storage Blocks. */
+const MTLShaderBufferBlock &MTLShaderInterface::get_storage_block(uint index) const
+{
+  BLI_assert(index < MTL_MAX_STORAGE_BUFFER_BINDINGS);
+  BLI_assert(index < get_total_storage_blocks());
+  return ssbos_[index];
+}
+
+uint32_t MTLShaderInterface::get_total_storage_blocks() const
+{
+  return total_storage_blocks_;
+}
+
+uint32_t MTLShaderInterface::get_max_ssbo_index() const
+{
+  return max_storagebuf_index_;
+}
+
+bool MTLShaderInterface::has_storage_block(uint32_t block_index) const
+{
+  return (block_index < total_storage_blocks_);
+}
+
+uint32_t MTLShaderInterface::get_storage_block_size(uint32_t block_index) const
+{
+  return (block_index < total_storage_blocks_) ? ssbos_[block_index].size : 0;
+}
+
 /* Textures. */
 const MTLShaderTexture &MTLShaderInterface::get_texture(uint index) const
 {
diff --git a/source/blender/gpu/metal/mtl_storage_buffer.hh b/source/blender/gpu/metal/mtl_storage_buffer.hh
new file mode 100644
index 00000000000..d57099a5aa0
--- /dev/null
+++ b/source/blender/gpu/metal/mtl_storage_buffer.hh
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/** \file
+ * \ingroup gpu
+ */
+
+#pragma once
+
+#include "MEM_guardedalloc.h"
+#include "gpu_storage_buffer_private.hh"
+
+#include "mtl_context.hh"
+
+namespace blender {
+namespace gpu {
+
+class MTLUniformBuf;
+class MTLVertBuf;
+class MTLIndexBuf;
+
+/**
+ * Implementation of Storage Buffers using Metal.
+ */
+class MTLStorageBuf : public StorageBuf {
+ private:
+  /** Allocation handle or indirectly wrapped instance.
+   * MTLStorageBuf can wrap a MTLVertBuf, MTLIndexBuf or MTLUniformBuf for binding as a writeable
+   * resource. */
+  enum {
+    MTL_STORAGE_BUF_TYPE_DEFAULT = 0,
+    MTL_STORAGE_BUF_TYPE_UNIFORMBUF = 1,
+    MTL_STORAGE_BUF_TYPE_VERTBUF = 2,
+    MTL_STORAGE_BUF_TYPE_INDEXBUF = 3,
+  } storage_source_ = MTL_STORAGE_BUF_TYPE_DEFAULT;
+
+  union {
+    /* Own allocation. */
+    gpu::MTLBuffer *metal_buffer_;
+    /* Wrapped type. */
+    MTLUniformBuf *uniform_buffer_;
+    MTLVertBuf *vertex_buffer_;
+    MTLIndexBuf *index_buffer_;
+  };
+
+  /* Whether the buffer has contents; if false, no GPU buffer
+   * has been allocated yet. */
+  bool has_data_ = false;
+  /** Bind-state tracking. */
+  int bind_slot_ = -1;
+  MTLContext *bound_ctx_ = nullptr;
+
+  /** Usage type. 
*/
+  GPUUsageType usage_;
+
+ public:
+  MTLStorageBuf(size_t size, GPUUsageType usage, const char *name);
+  ~MTLStorageBuf();
+
+  MTLStorageBuf(MTLUniformBuf *uniform_buf, size_t size);
+  MTLStorageBuf(MTLVertBuf *vert_buf, size_t size);
+  MTLStorageBuf(MTLIndexBuf *index_buf, size_t size);
+
+  void update(const void *data) override;
+  void bind(int slot) override;
+  void unbind() override;
+  void clear(eGPUTextureFormat internal_format, eGPUDataFormat data_format, void *data) override;
+  void copy_sub(VertBuf *src, uint dst_offset, uint src_offset, uint copy_size) override;
+  void read(void *data) override;
+
+  void init();
+
+  id<MTLBuffer> get_metal_buffer();
+  int get_size();
+  const char *get_name()
+  {
+    return name_;
+  }
+
+ private:
+  MEM_CXX_CLASS_ALLOC_FUNCS("MTLStorageBuf");
+};
+
+} // namespace gpu
+} // namespace blender
diff --git a/source/blender/gpu/metal/mtl_storage_buffer.mm b/source/blender/gpu/metal/mtl_storage_buffer.mm
new file mode 100644
index 00000000000..07b05beb5eb
--- /dev/null
+++ b/source/blender/gpu/metal/mtl_storage_buffer.mm
@@ -0,0 +1,313 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/** \file
+ * \ingroup gpu
+ */
+
+#include "BLI_string.h"
+
+#include "gpu_backend.hh"
+#include "gpu_context_private.hh"
+
+#include "mtl_backend.hh"
+#include "mtl_context.hh"
+#include "mtl_debug.hh"
+#include "mtl_index_buffer.hh"
+#include "mtl_storage_buffer.hh"
+#include "mtl_uniform_buffer.hh"
+#include "mtl_vertex_buffer.hh"
+
+namespace blender::gpu {
+
+/* -------------------------------------------------------------------- */
+/** \name Creation & Deletion
+ * \{ */
+
+MTLStorageBuf::MTLStorageBuf(size_t size, GPUUsageType usage, const char *name)
+    : StorageBuf(size, name)
+{
+  usage_ = usage;
+  /* Do not create SSBO MTL buffer here to allow allocation from any thread. */
+  storage_source_ = MTL_STORAGE_BUF_TYPE_DEFAULT;
+  metal_buffer_ = nullptr;
+}
+
+MTLStorageBuf::MTLStorageBuf(MTLUniformBuf *uniform_buf, size_t size)
+    : StorageBuf(size, "UniformBuffer_as_SSBO")
+{
+  usage_ = GPU_USAGE_DYNAMIC;
+  storage_source_ = MTL_STORAGE_BUF_TYPE_UNIFORMBUF;
+  uniform_buffer_ = uniform_buf;
+  BLI_assert(uniform_buffer_ != nullptr);
+}
+
+MTLStorageBuf::MTLStorageBuf(MTLVertBuf *vert_buf, size_t size)
+    : StorageBuf(size, "VertexBuffer_as_SSBO")
+{
+  usage_ = GPU_USAGE_DYNAMIC;
+  storage_source_ = MTL_STORAGE_BUF_TYPE_VERTBUF;
+  vertex_buffer_ = vert_buf;
+  BLI_assert(vertex_buffer_ != nullptr);
+}
+
+MTLStorageBuf::MTLStorageBuf(MTLIndexBuf *index_buf, size_t size)
+    : StorageBuf(size, "IndexBuffer_as_SSBO")
+{
+  usage_ = GPU_USAGE_DYNAMIC;
+  storage_source_ = MTL_STORAGE_BUF_TYPE_INDEXBUF;
+  index_buffer_ = index_buf;
+  BLI_assert(index_buffer_ != nullptr);
+}
+
+MTLStorageBuf::~MTLStorageBuf()
+{
+  if (storage_source_ == MTL_STORAGE_BUF_TYPE_DEFAULT) {
+    if (metal_buffer_ != nullptr) {
+      metal_buffer_->free();
+      metal_buffer_ = nullptr;
+    }
+    has_data_ = false;
+  }
+
+  /* Ensure SSBO is not bound to the active context.
+   * SSBO bindings are reset upon context-switch, so we do not need
+   * to check deactivated contexts. 
*/
+  MTLContext *ctx = MTLContext::get();
+  if (ctx) {
+    for (int i = 0; i < MTL_MAX_STORAGE_BUFFER_BINDINGS; i++) {
+      MTLStorageBufferBinding &slot = ctx->pipeline_state.ssbo_bindings[i];
+      if (slot.bound && slot.ssbo == this) {
+        slot.bound = false;
+        slot.ssbo = nullptr;
+      }
+    }
+  }
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Data upload / update
+ * \{ */
+
+void MTLStorageBuf::init()
+{
+  /* We only need to initialize the storage buffer for default buffer types. */
+  if (storage_source_ != MTL_STORAGE_BUF_TYPE_DEFAULT) {
+    return;
+  }
+  BLI_assert(this);
+  BLI_assert(size_in_bytes_ > 0);
+
+  /* Allocate MTL buffer. */
+  MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
+  BLI_assert(ctx);
+  BLI_assert(ctx->device);
+  UNUSED_VARS_NDEBUG(ctx);
+
+  metal_buffer_ = MTLContext::get_global_memory_manager()->allocate(size_in_bytes_, true);
+
+#ifndef NDEBUG
+  metal_buffer_->set_label([NSString stringWithFormat:@"Storage Buffer %s", name_]);
+#endif
+  BLI_assert(metal_buffer_ != nullptr);
+  BLI_assert(metal_buffer_->get_metal_buffer() != nil);
+
+  has_data_ = false;
+}
+
+void MTLStorageBuf::update(const void *data)
+{
+  /* Only storage buffers of default type own their allocation and can be updated directly. */
+  if (storage_source_ != MTL_STORAGE_BUF_TYPE_DEFAULT) {
+    return;
+  }
+
+  /* Ensure buffer has been allocated. */
+  if (metal_buffer_ == nullptr) {
+    init();
+  }
+
+  BLI_assert(data != nullptr);
+  if (data != nullptr) {
+    /* Upload data. */
+    BLI_assert(!(metal_buffer_->get_resource_options() & MTLResourceStorageModePrivate));
+    BLI_assert(size_in_bytes_ <= metal_buffer_->get_size());
+    BLI_assert(size_in_bytes_ <= [metal_buffer_->get_metal_buffer() length]);
+    memcpy(metal_buffer_->get_host_ptr(), data, size_in_bytes_);
+    metal_buffer_->flush_range(0, size_in_bytes_);
+    has_data_ = true;
+  }
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Usage
+ * \{ */
+
+void MTLStorageBuf::bind(int slot)
+{
+  if (slot >= MTL_MAX_STORAGE_BUFFER_BINDINGS) {
+    fprintf(
+        stderr,
+        "Error: Trying to bind \"%s\" ssbo to slot %d which is above the reported limit of %d.\n",
+        name_,
+        slot,
+        MTL_MAX_STORAGE_BUFFER_BINDINGS);
+    BLI_assert(false);
+    return;
+  }
+
+  if (metal_buffer_ == nullptr) {
+    this->init();
+  }
+
+  if (data_ != nullptr) {
+    this->update(data_);
+    MEM_SAFE_FREE(data_);
+  }
+
+  /* Bind current SSBO to the active context. */
+  MTLContext *ctx = MTLContext::get();
+  BLI_assert(ctx);
+
+  MTLStorageBufferBinding &ctx_ssbo_bind_slot = ctx->pipeline_state.ssbo_bindings[slot];
+  ctx_ssbo_bind_slot.ssbo = this;
+  ctx_ssbo_bind_slot.bound = true;
+
+  bind_slot_ = slot;
+  bound_ctx_ = ctx;
+}
+
+void MTLStorageBuf::unbind()
+{
+  /* Unbind in debug mode to validate missing binds.
+   * Otherwise, only perform a full unbind upon destruction
+   * to ensure no lingering references. */
+#ifndef NDEBUG
+  if (true) {
+#else
+  if (G.debug & G_DEBUG_GPU) {
+#endif
+    if (bound_ctx_ != nullptr && bind_slot_ > -1) {
+      MTLStorageBufferBinding &ctx_ssbo_bind_slot =
+          bound_ctx_->pipeline_state.ssbo_bindings[bind_slot_];
+      if (ctx_ssbo_bind_slot.bound && ctx_ssbo_bind_slot.ssbo == this) {
+        ctx_ssbo_bind_slot.bound = false;
+        ctx_ssbo_bind_slot.ssbo = nullptr;
+      }
+    }
+  }
+
+  /* Reset bind index. 
*/
+  bind_slot_ = -1;
+  bound_ctx_ = nullptr;
+}
+
+void MTLStorageBuf::clear(eGPUTextureFormat internal_format,
+                          eGPUDataFormat data_format,
+                          void *data)
+{
+  /* Fetch active context. */
+  MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
+  BLI_assert(ctx);
+
+  if (metal_buffer_ == nullptr) {
+    this->init();
+  }
+
+  if (ctx) {
+    /* Fast clear. */
+    id<MTLBlitCommandEncoder> blit_encoder = ctx->main_command_buffer.ensure_begin_blit_encoder();
+    [blit_encoder fillBuffer:metal_buffer_->get_metal_buffer()
+                       range:NSMakeRange(0, size_in_bytes_)
+                       value:0];
+  }
+  else {
+    /* Fallback inefficient clear if outside of render context. */
+    void *clear_data = calloc(1, size_in_bytes_);
+    this->update(clear_data);
+    free(clear_data);
+  }
+}
+
+void MTLStorageBuf::copy_sub(VertBuf *src_, uint dst_offset, uint src_offset, uint copy_size)
+{
+  /* TODO(Metal): Support copy_sub operation. */
+  MTL_LOG_WARNING("MTLStorageBuf::copy_sub not yet supported.\n");
+}
+
+void MTLStorageBuf::read(void *data)
+{
+  if (data == nullptr) {
+    return;
+  }
+
+  if (metal_buffer_ == nullptr) {
+    this->init();
+  }
+
+  /* Managed buffers need to be explicitly flushed back to host. */
+  if (metal_buffer_->get_resource_options() & MTLResourceStorageModeManaged) {
+    /* Fetch active context. */
+    MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
+    BLI_assert(ctx);
+
+    /* Ensure GPU updates are flushed back to CPU. */
+    id<MTLBlitCommandEncoder> blit_encoder = ctx->main_command_buffer.ensure_begin_blit_encoder();
+    [blit_encoder synchronizeResource:metal_buffer_->get_metal_buffer()];
+
+    /* Ensure sync has occurred. */
+    GPU_finish();
+  }
+
+  /* Read data. NOTE: Unless explicitly synchronized with GPU work, results may not be ready. */
+  memcpy(data, metal_buffer_->get_host_ptr(), size_in_bytes_);
+}
+
+id<MTLBuffer> MTLStorageBuf::get_metal_buffer()
+{
+  gpu::MTLBuffer *source_buffer = nullptr;
+  switch (storage_source_) {
+    /* Default SSBO buffer comes from own allocation. */
+    case MTL_STORAGE_BUF_TYPE_DEFAULT: {
+      if (metal_buffer_ == nullptr) {
+        this->init();
+      }

+      if (data_ != nullptr) {
+        this->update(data_);
+        MEM_SAFE_FREE(data_);
+      }
+      source_buffer = metal_buffer_;
+    } break;
+    /* SSBO buffer comes from Uniform Buffer. */
+    case MTL_STORAGE_BUF_TYPE_UNIFORMBUF: {
+      source_buffer = uniform_buffer_->metal_buffer_;
+    } break;
+    /* SSBO buffer comes from Vertex Buffer. */
+    case MTL_STORAGE_BUF_TYPE_VERTBUF: {
+      source_buffer = vertex_buffer_->vbo_;
+    } break;
+    /* SSBO buffer comes from Index Buffer. */
+    case MTL_STORAGE_BUF_TYPE_INDEXBUF: {
+      source_buffer = index_buffer_->ibo_;
+    } break;
+  }
+
+  /* Return Metal allocation handle and flag as used. */
+  BLI_assert(source_buffer != nullptr);
+  source_buffer->debug_ensure_used();
+  return source_buffer->get_metal_buffer();
+}
+
+int MTLStorageBuf::get_size()
+{
+  BLI_assert(this);
+  return size_in_bytes_;
+}
+
+} // namespace blender::gpu
diff --git a/source/blender/gpu/metal/mtl_uniform_buffer.hh b/source/blender/gpu/metal/mtl_uniform_buffer.hh
index b2250fc28d4..8d350a1b96c 100644
--- a/source/blender/gpu/metal/mtl_uniform_buffer.hh
+++ b/source/blender/gpu/metal/mtl_uniform_buffer.hh
@@ -13,10 +13,14 @@

 namespace blender::gpu {

+class MTLStorageBuf;
+
 /**
  * Implementation of Uniform Buffers using Metal.
  **/
 class MTLUniformBuf : public UniformBuf {
+  friend class MTLStorageBuf; /* For bind as SSBO resource access. */
+
 private:
  /* Allocation Handle. 
*/ gpu::MTLBuffer *metal_buffer_ = nullptr; @@ -29,6 +33,9 @@ class MTLUniformBuf : public UniformBuf { int bind_slot_ = -1; MTLContext *bound_ctx_ = nullptr; + /* SSBO wrapper for bind_as_ssbo support. */ + MTLStorageBuf *ssbo_wrapper_ = nullptr; + public: MTLUniformBuf(size_t size, const char *name); ~MTLUniformBuf(); diff --git a/source/blender/gpu/metal/mtl_uniform_buffer.mm b/source/blender/gpu/metal/mtl_uniform_buffer.mm index ca209f86790..c6963601254 100644 --- a/source/blender/gpu/metal/mtl_uniform_buffer.mm +++ b/source/blender/gpu/metal/mtl_uniform_buffer.mm @@ -14,6 +14,7 @@ #include "mtl_backend.hh" #include "mtl_context.hh" #include "mtl_debug.hh" +#include "mtl_storage_buffer.hh" #include "mtl_uniform_buffer.hh" namespace blender::gpu { @@ -43,6 +44,11 @@ MTLUniformBuf::~MTLUniformBuf() } } } + + if (ssbo_wrapper_) { + delete ssbo_wrapper_; + ssbo_wrapper_ = nullptr; + } } void MTLUniformBuf::update(const void *data) @@ -128,7 +134,25 @@ void MTLUniformBuf::bind_as_ssbo(int slot) return; } - BLI_assert_msg(0, "Not implemented yet"); + /* We need to ensure data is actually allocated if using as an SSBO, as resource may be written + * to. */ + if (metal_buffer_ == nullptr) { + /* Check if we have any deferred data to upload. */ + if (data_ != nullptr) { + this->update(data_); + MEM_SAFE_FREE(data_); + } + else { + this->clear_to_zero(); + } + } + + /* Create MTLStorageBuffer to wrap this resource and use conventional binding. */ + if (ssbo_wrapper_ == nullptr) { + ssbo_wrapper_ = new MTLStorageBuf(this, size_in_bytes_); + } + + ssbo_wrapper_->bind(slot); } void MTLUniformBuf::unbind() diff --git a/source/blender/gpu/metal/mtl_vertex_buffer.hh b/source/blender/gpu/metal/mtl_vertex_buffer.hh index 056e2062ab1..05a896919b6 100644 --- a/source/blender/gpu/metal/mtl_vertex_buffer.hh +++ b/source/blender/gpu/metal/mtl_vertex_buffer.hh @@ -22,7 +22,8 @@ class MTLVertBuf : public VertBuf { friend class gpu::MTLTexture; /* For buffer texture. */ friend class MTLShader; /* For transform feedback. */ friend class MTLBatch; - friend class MTLContext; /* For transform feedback. */ + friend class MTLContext; /* For transform feedback. */ + friend class MTLStorageBuf; /* For bind as SSBO resource access. */ private: /** Metal buffer allocation. **/ @@ -37,6 +38,8 @@ class MTLVertBuf : public VertBuf { uint64_t alloc_size_ = 0; /** Whether existing allocation has been submitted for use by the GPU. */ bool contents_in_flight_ = false; + /* SSBO wrapper for bind_as_ssbo support. */ + MTLStorageBuf *ssbo_wrapper_ = nullptr; /* Fetch Metal buffer and offset into allocation if necessary. * Access limited to friend classes. */ diff --git a/source/blender/gpu/metal/mtl_vertex_buffer.mm b/source/blender/gpu/metal/mtl_vertex_buffer.mm index 6114e1057c8..459c8188225 100644 --- a/source/blender/gpu/metal/mtl_vertex_buffer.mm +++ b/source/blender/gpu/metal/mtl_vertex_buffer.mm @@ -5,6 +5,7 @@ */ #include "mtl_vertex_buffer.hh" #include "mtl_debug.hh" +#include "mtl_storage_buffer.hh" namespace blender::gpu { @@ -50,6 +51,11 @@ void MTLVertBuf::release_data() GPU_TEXTURE_FREE_SAFE(buffer_texture_); MEM_SAFE_FREE(data); + + if (ssbo_wrapper_) { + delete ssbo_wrapper_; + ssbo_wrapper_ = nullptr; + } } void MTLVertBuf::duplicate_data(VertBuf *dst_) @@ -294,10 +300,16 @@ void MTLVertBuf::update_sub(uint start, uint len, const void *data) void MTLVertBuf::bind_as_ssbo(uint binding) { - /* TODO(Metal): Support binding of buffers as SSBOs. - * Pending overall compute support for Metal backend. 
*/
-  MTL_LOG_WARNING("MTLVertBuf::bind_as_ssbo not yet implemented!\n");
   this->flag_used();
+
+  /* Ensure resource is initialized. */
+  this->bind();
+
+  /* Create MTLStorageBuffer to wrap this resource and use conventional binding. */
+  if (ssbo_wrapper_ == nullptr) {
+    ssbo_wrapper_ = new MTLStorageBuf(this, alloc_size_);
+  }
+  ssbo_wrapper_->bind(binding);
 }

 void MTLVertBuf::bind_as_texture(uint binding)
diff --git a/source/blender/gpu/shaders/metal/mtl_shader_defines.msl b/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
index 5cb9c47f36f..20ab79dc183 100644
--- a/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
+++ b/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
@@ -97,10 +97,18 @@ struct constexp_uvec3 {
       return 0;
     }
   }
-  inline operator uint3() const
+  constexpr inline operator uint3() const
   {
     return xyz;
   }
+  constexpr inline operator uint2() const
+  {
+    return xy;
+  }
+  constexpr inline operator uint() const
+  {
+    return x;
+  }
 };

 constexpr constexp_uvec3 __internal_workgroupsize_get()
@@ -136,6 +144,10 @@ template<typename T> T atomicSub(threadgroup T &mem, T data)
 {
   return atomic_fetch_sub_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
 }
+template<typename T> T atomicAnd(threadgroup T &mem, T data)
+{
+  return atomic_fetch_and_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+}
 template<typename T> T atomicOr(threadgroup T &mem, T data)
 {
   return atomic_fetch_or_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
@@ -148,33 +160,40 @@ template<typename T> T atomicXor(threadgroup T &mem, T data)
 {
   return atomic_fetch_xor_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
 }

 /* Device memory. */
 template<typename T> T atomicMax(device T &mem, T data)
 {
-  return atomic_fetch_max_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+  return atomic_fetch_max_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
 }
 template<typename T> T atomicMin(device T &mem, T data)
 {
-  return atomic_fetch_min_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+  return atomic_fetch_min_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
 }
 template<typename T> T atomicAdd(device T &mem, T data)
 {
-  return atomic_fetch_add_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+  return atomic_fetch_add_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
 }
 template<typename T> T atomicSub(device T &mem, T data)
 {
-  return atomic_fetch_sub_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+  return atomic_fetch_sub_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
+}
+template<typename T> T atomicAnd(device T &mem, T data)
+{
+  return atomic_fetch_and_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
 }
 template<typename T> T atomicOr(device T &mem, T data)
 {
-  return atomic_fetch_or_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+  return atomic_fetch_or_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
 }
 template<typename T> T atomicXor(device T &mem, T data)
 {
-  return atomic_fetch_xor_explicit((threadgroup _atomic<T> *)&mem, data, memory_order_relaxed);
+  return atomic_fetch_xor_explicit((device _atomic<T> *)&mem, data, memory_order_relaxed);
 }

 /* Used to replace 'out' in function parameters with threadlocal reference
  * shortened to avoid expanding the glsl source string. 
*/
 #define THD thread
 #define OUT(type, name, array) thread type(&name)[array]
+#define THREADGROUP_OUT_ARRAY(type, name, array) threadgroup type(&name)[array]
+#define DEVICE_OUT_ARRAY(type, name, array) device type(&name)[array]
+#define DEVICE_OUT(type, name) device type &name

 /* Generate wrapper structs for combined texture and sampler type. */
 #ifdef USE_ARGUMENT_BUFFER_FOR_SAMPLERS
@@ -1122,6 +1141,27 @@ inline float4 uintBitsToFloat(uint4 f)
 {
   return as_type<float4>(f);
 }

+#define bitfieldReverse reverse_bits
+#define bitfieldExtract extract_bits
+#define bitfieldInsert insert_bits
+#define bitCount popcount
+
+template<typename T> T findLSB(T x)
+{
+  /* ctz returns the number of trailing zeroes, which is also the index of the least significant
+   * set bit. The zero-input case is filtered out to return -1, matching GLSL functionality. */
+  return (x == T(0)) ? T(-1) : T(ctz(x));
+}
+
+template<typename T> T findMSB(T x)
+{
+  /* clz returns the number of leading zeroes. The index of the most significant set bit is the
+   * type's bit width (clz of zero) minus the leading-zero count, offset by 1. The zero-input
+   * case is filtered out to return -1, matching GLSL functionality. */
+  return (x == T(0)) ? T(-1) : (clz(T(0)) - clz(x) - T(1));
+}
+
 /* Texture size functions. Add texture types as needed. */
 #define imageSize(image) textureSize(image, 0)
diff --git a/source/blender/gpu/shaders/opengl/glsl_shader_defines.glsl b/source/blender/gpu/shaders/opengl/glsl_shader_defines.glsl
index f2d972ea574..e8119c57d6c 100644
--- a/source/blender/gpu/shaders/opengl/glsl_shader_defines.glsl
+++ b/source/blender/gpu/shaders/opengl/glsl_shader_defines.glsl
@@ -15,6 +15,14 @@
 #define depthCubeArray samplerCubeArray
 #define depth2DArrayShadow sampler2DArrayShadow

+/* Memory scope and pass-by-reference types.
+ * NOTE: These are required by Metal, but are not required in all cases by GLSL. */
+#define device
+#define threadgroup
+#define OUT(type, name, array_len) out type name[array_len]
+#define DEVICE_OUT_ARRAY(type, name, array_len) out type name[array_len]
+#define DEVICE_OUT(type, name) out type
+
 /* Backend Functions. */
 #define select(A, B, mask) mix(A, B, mask)
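
As a sanity check on the findLSB/findMSB polyfills above: GLSL defines both to return -1 for a zero input, findLSB as the index of the least significant set bit, and findMSB (for unsigned inputs) as the index of the most significant set bit. A minimal host-side C++ sketch of the same semantics, using the Clang/GCC builtins as the reference (the `ref_*` names are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

/* Reference for GLSL findLSB(): index of the least significant set bit, -1 for zero. */
static int ref_findLSB(uint32_t x)
{
  return (x == 0u) ? -1 : __builtin_ctz(x); /* ctz is undefined for 0, hence the guard. */
}

/* Reference for GLSL findMSB() on unsigned input: index of the most significant set bit.
 * `31 - clz(x)` is the 32-bit instance of the `clz(T(0)) - clz(x) - 1` form used in the MSL
 * polyfill. */
static int ref_findMSB(uint32_t x)
{
  return (x == 0u) ? -1 : 31 - __builtin_clz(x);
}

int main()
{
  assert(ref_findLSB(0b01000u) == 3);
  assert(ref_findMSB(0b01100u) == 3);
  assert(ref_findLSB(0u) == -1 && ref_findMSB(0u) == -1);
  return 0;
}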
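
The 16-byte round-up shared by MTLShaderInterface::add_uniform_block() and add_storage_block() rounds any block size up to the next multiple of 16 bytes so Metal's buffer alignment rules hold. Isolated as a standalone sketch (the helper name `align16` is hypothetical):

#include <cstdint>

/* Round `size` up to the next multiple of 16 bytes, mirroring the logic in
 * add_uniform_block()/add_storage_block(). */
static uint32_t align16(uint32_t size)
{
  if ((size % 16) != 0) {
    size += 16 - (size % 16);
  }
  return size; /* e.g. align16(1) == 16, align16(16) == 16, align16(20) == 32. */
}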
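
For context on how the new MTLStorageBuf path is reached: storage buffers are created and bound through the backend-agnostic GPU_storagebuf_* API, which dispatches to the active backend's StorageBuf implementation. A hedged usage sketch, assuming the public functions declared in `GPU_storage_buffer.h` (the call site shown here is illustrative, not taken from Blender's sources):

#include "GPU_storage_buffer.h"

void example_ssbo_roundtrip()
{
  const float src[4] = {0.0f, 1.0f, 2.0f, 3.0f};
  /* On Metal this creates an MTLStorageBuf; the GPU allocation is deferred until first use. */
  GPUStorageBuf *ssbo = GPU_storagebuf_create_ex(sizeof(src), src, GPU_USAGE_STATIC, "demo_ssbo");

  /* Routes to MTLStorageBuf::bind(), recording the slot in the context's ssbo_bindings table. */
  GPU_storagebuf_bind(ssbo, 0);

  /* ... bind a compute shader and dispatch here ... */

  /* MTLStorageBuf::read() synchronizes managed buffers back to host before the memcpy. */
  float dst[4];
  GPU_storagebuf_read(ssbo, dst);

  GPU_storagebuf_unbind(ssbo);
  GPU_storagebuf_free(ssbo);
}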