WIP: uv-simple-select #1

Closed
Chris Blackbourn wants to merge 182 commits from uv-simple-select into main

21 changed files with 800 additions and 102 deletions
Showing only changes of commit b47c938af2.


@ -197,6 +197,7 @@ void DRW_gpu_render_context_enable(void *re_gpu_context);
void DRW_gpu_render_context_disable(void *re_gpu_context);
void DRW_deferred_shader_remove(struct GPUMaterial *mat);
void DRW_deferred_shader_optimize_remove(struct GPUMaterial *mat);
/**
* Get DrawData from the given ID-block. In order for this to work, we assume that


@ -53,6 +53,7 @@ static void eevee_engine_init(void *ved)
stl->g_data->valid_double_buffer = (txl->color_double_buffer != NULL);
stl->g_data->valid_taa_history = (txl->taa_history != NULL);
stl->g_data->queued_shaders_count = 0;
stl->g_data->queued_optimise_shaders_count = 0;
stl->g_data->render_timesteps = 1;
stl->g_data->disable_ligthprobes = v3d &&
(v3d->object_type_exclude_viewport & (1 << OB_LIGHTPROBE));
@ -178,6 +179,11 @@ static void eevee_cache_finish(void *vedata)
if (g_data->queued_shaders_count > 0) {
SNPRINTF(ved->info, TIP_("Compiling Shaders (%d remaining)"), g_data->queued_shaders_count);
}
else if (g_data->queued_optimise_shaders_count > 0) {
SNPRINTF(ved->info,
TIP_("Optimizing Shaders (%d remaining)"),
g_data->queued_optimise_shaders_count);
}
}
/* As renders in an HDR off-screen buffer, we need to draw everything once


@ -1000,6 +1000,8 @@ typedef struct EEVEE_PrivateData {
/* Compiling shaders count. This is to track if a shader has finished compiling. */
int queued_shaders_count;
int queued_shaders_count_prev;
/* Optimising shaders count. */
int queued_optimise_shaders_count;
/* LookDev Settings */
int studiolight_index;


@ -1390,12 +1390,21 @@ struct GPUMaterial *EEVEE_material_get(
return nullptr;
}
switch (status) {
case GPU_MAT_SUCCESS:
break;
case GPU_MAT_QUEUED:
case GPU_MAT_SUCCESS: {
/* Determine optimization status for remaining compilations counter. */
int optimization_status = GPU_material_optimization_status(mat);
if (optimization_status == GPU_MAT_OPTIMIZATION_QUEUED) {
vedata->stl->g_data->queued_optimise_shaders_count++;
}
} break;
case GPU_MAT_QUEUED: {
vedata->stl->g_data->queued_shaders_count++;
mat = EEVEE_material_default_get(scene, ma, options);
break;
GPUMaterial *default_mat = EEVEE_material_default_get(scene, ma, options);
/* Mark pending material with its default material for future cache warming. */
GPU_material_set_default(mat, default_mat);
/* Return default material. */
mat = default_mat;
} break;
case GPU_MAT_FAILED:
default:
ma = EEVEE_material_default_error_get();


@ -507,6 +507,8 @@ GPUMaterial *ShaderModule::material_shader_get(const char *name,
this);
GPU_material_status_set(gpumat, GPU_MAT_QUEUED);
GPU_material_compile(gpumat);
/* Queue deferred material optimization. */
DRW_shader_queue_optimize_material(gpumat);
return gpumat;
}


@ -307,6 +307,7 @@ struct GPUMaterial *DRW_shader_from_material(struct Material *ma,
bool deferred,
GPUCodegenCallbackFn callback,
void *thunk);
void DRW_shader_queue_optimize_material(struct GPUMaterial *mat);
void DRW_shader_free(struct GPUShader *shader);
#define DRW_SHADER_FREE_SAFE(shader) \
do { \


@ -55,6 +55,9 @@ typedef struct DRWShaderCompiler {
ListBase queue; /* GPUMaterial */
SpinLock list_lock;
/** Optimization queue. */
ListBase optimize_queue; /* GPUMaterial */
void *gl_context;
GPUContext *gpu_context;
bool own_context;
@ -110,8 +113,29 @@ static void drw_deferred_shader_compilation_exec(
MEM_freeN(link);
}
else {
/* No more materials to optimize, or shaders to compile. */
break;
/* Check for Material Optimization job once there are no more
* shaders to compile. */
BLI_spin_lock(&comp->list_lock);
/* Pop tail because it will be less likely to lock the main thread
* if all GPUMaterials are to be freed (see DRW_deferred_shader_remove()). */
LinkData *link = (LinkData *)BLI_poptail(&comp->optimize_queue);
GPUMaterial *optimize_mat = link ? (GPUMaterial *)link->data : NULL;
if (optimize_mat) {
/* Avoid another thread freeing the material during optimization. */
GPU_material_acquire(optimize_mat);
}
BLI_spin_unlock(&comp->list_lock);
if (optimize_mat) {
/* Compile optimized material shader. */
GPU_material_optimize(optimize_mat);
GPU_material_release(optimize_mat);
MEM_freeN(link);
}
else {
/* No more materials to optimize, or shaders to compile. */
break;
}
}
if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_ANY, GPU_DRIVER_ANY, GPU_BACKEND_OPENGL)) {
@ -133,6 +157,7 @@ static void drw_deferred_shader_compilation_free(void *custom_data)
BLI_spin_lock(&comp->list_lock);
BLI_freelistN(&comp->queue);
BLI_freelistN(&comp->optimize_queue);
BLI_spin_unlock(&comp->list_lock);
if (comp->own_context) {
@ -148,34 +173,13 @@ static void drw_deferred_shader_compilation_free(void *custom_data)
MEM_freeN(comp);
}
static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
/**
* Append either a shader compilation or an optimization job to the deferred queue and
* ensure the shader compilation worker is active.
* We keep two separate queues to ensure core compilations always complete before optimization.
*/
static void drw_deferred_queue_append(GPUMaterial *mat, bool is_optimization_job)
{
if (ELEM(GPU_material_status(mat), GPU_MAT_SUCCESS, GPU_MAT_FAILED)) {
return;
}
/* Do not defer the compilation if we are rendering an image.
* Deferred compilation is only possible when `evil_C` is available. */
if (DST.draw_ctx.evil_C == NULL || DRW_state_is_image_render() || !USE_DEFERRED_COMPILATION) {
deferred = false;
}
if (!deferred) {
DRW_deferred_shader_remove(mat);
/* Shaders could already be compiling. Have to wait for compilation to finish. */
while (GPU_material_status(mat) == GPU_MAT_QUEUED) {
PIL_sleep_ms(20);
}
if (GPU_material_status(mat) == GPU_MAT_CREATED) {
GPU_material_compile(mat);
}
return;
}
/* Don't add material to the queue twice. */
if (GPU_material_status(mat) == GPU_MAT_QUEUED) {
return;
}
const bool use_main_context = GPU_use_main_context_workaround();
const bool job_own_context = !use_main_context;
@ -196,6 +200,7 @@ static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
if (old_comp) {
BLI_spin_lock(&old_comp->list_lock);
BLI_movelisttolist(&comp->queue, &old_comp->queue);
BLI_movelisttolist(&comp->optimize_queue, &old_comp->optimize_queue);
BLI_spin_unlock(&old_comp->list_lock);
/* Do not recreate context, just pass ownership. */
if (old_comp->gl_context) {
@ -206,9 +211,18 @@ static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
}
}
GPU_material_status_set(mat, GPU_MAT_QUEUED);
LinkData *node = BLI_genericNodeN(mat);
BLI_addtail(&comp->queue, node);
/* Add to either compilation or optimization queue. */
if (is_optimization_job) {
BLI_assert(GPU_material_optimization_status(mat) != GPU_MAT_OPTIMIZATION_QUEUED);
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_QUEUED);
LinkData *node = BLI_genericNodeN(mat);
BLI_addtail(&comp->optimize_queue, node);
}
else {
GPU_material_status_set(mat, GPU_MAT_QUEUED);
LinkData *node = BLI_genericNodeN(mat);
BLI_addtail(&comp->queue, node);
}
/* Create only one context. */
if (comp->gl_context == NULL) {
@ -237,6 +251,39 @@ static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
WM_jobs_start(wm, wm_job);
}
static void drw_deferred_shader_add(GPUMaterial *mat, bool deferred)
{
if (ELEM(GPU_material_status(mat), GPU_MAT_SUCCESS, GPU_MAT_FAILED)) {
return;
}
/* Do not defer the compilation if we are rendering an image.
* Deferred compilation is only possible when `evil_C` is available. */
if (DST.draw_ctx.evil_C == NULL || DRW_state_is_image_render() || !USE_DEFERRED_COMPILATION) {
deferred = false;
}
if (!deferred) {
DRW_deferred_shader_remove(mat);
/* Shaders could already be compiling. Have to wait for compilation to finish. */
while (GPU_material_status(mat) == GPU_MAT_QUEUED) {
PIL_sleep_ms(20);
}
if (GPU_material_status(mat) == GPU_MAT_CREATED) {
GPU_material_compile(mat);
}
return;
}
/* Don't add material to the queue twice. */
if (GPU_material_status(mat) == GPU_MAT_QUEUED) {
return;
}
/* Add deferred shader compilation to queue. */
drw_deferred_queue_append(mat, false);
}
static void drw_register_shader_vlattrs(GPUMaterial *mat)
{
const ListBase *attrs = GPU_material_layer_attributes(mat);
@ -288,9 +335,42 @@ void DRW_deferred_shader_remove(GPUMaterial *mat)
BLI_remlink(&comp->queue, link);
GPU_material_status_set(link->data, GPU_MAT_CREATED);
}
BLI_spin_unlock(&comp->list_lock);
MEM_SAFE_FREE(link);
/* Search for optimization job in queue. */
LinkData *opti_link = (LinkData *)BLI_findptr(
&comp->optimize_queue, mat, offsetof(LinkData, data));
if (opti_link) {
BLI_remlink(&comp->optimize_queue, opti_link);
GPU_material_optimization_status_set(opti_link->data, GPU_MAT_OPTIMIZATION_READY);
}
BLI_spin_unlock(&comp->list_lock);
MEM_SAFE_FREE(opti_link);
}
}
}
}
void DRW_deferred_shader_optimize_remove(GPUMaterial *mat)
{
LISTBASE_FOREACH (wmWindowManager *, wm, &G_MAIN->wm) {
LISTBASE_FOREACH (wmWindow *, win, &wm->windows) {
DRWShaderCompiler *comp = (DRWShaderCompiler *)WM_jobs_customdata_from_type(
wm, wm, WM_JOB_TYPE_SHADER_COMPILATION);
if (comp != NULL) {
BLI_spin_lock(&comp->list_lock);
/* Search for optimization job in queue. */
LinkData *opti_link = (LinkData *)BLI_findptr(
&comp->optimize_queue, mat, offsetof(LinkData, data));
if (opti_link) {
BLI_remlink(&comp->optimize_queue, opti_link);
GPU_material_optimization_status_set(opti_link->data, GPU_MAT_OPTIMIZATION_READY);
}
BLI_spin_unlock(&comp->list_lock);
MEM_SAFE_FREE(opti_link);
}
}
}
@ -432,6 +512,7 @@ GPUMaterial *DRW_shader_from_world(World *wo,
}
drw_deferred_shader_add(mat, deferred);
DRW_shader_queue_optimize_material(mat);
return mat;
}
@ -463,9 +544,52 @@ GPUMaterial *DRW_shader_from_material(Material *ma,
}
drw_deferred_shader_add(mat, deferred);
DRW_shader_queue_optimize_material(mat);
return mat;
}
void DRW_shader_queue_optimize_material(GPUMaterial *mat)
{
/* Do not perform deferred optimization when performing an image render.
* De-queue any queued optimization jobs. */
if (DRW_state_is_image_render()) {
if (GPU_material_optimization_status(mat) == GPU_MAT_OPTIMIZATION_QUEUED) {
/* Remove from pending optimization job queue. */
DRW_deferred_shader_optimize_remove(mat);
/* If optimization job had already started, wait for it to complete. */
while (GPU_material_optimization_status(mat) == GPU_MAT_OPTIMIZATION_QUEUED) {
PIL_sleep_ms(20);
}
}
return;
}
/* We do not need to perform optimization on the material if it is already compiled or in the
* optimization queue. If optimization is not required, the status will be flagged as
* `GPU_MAT_OPTIMIZATION_SKIP`.
* We can also skip cases which have already been queued up. */
if (ELEM(GPU_material_optimization_status(mat),
GPU_MAT_OPTIMIZATION_SKIP,
GPU_MAT_OPTIMIZATION_SUCCESS,
GPU_MAT_OPTIMIZATION_QUEUED)) {
return;
}
/* Only queue optimization once the original shader has been successfully compiled. */
if (GPU_material_status(mat) != GPU_MAT_SUCCESS) {
return;
}
/* Defer optimization until sufficient time has passed beyond creation. This avoids excessive
* recompilation for shaders which are being actively modified. */
if (!GPU_material_optimization_ready(mat)) {
return;
}
/* Add deferred shader optimization to queue. */
drw_deferred_queue_append(mat, true);
}
void DRW_shader_free(GPUShader *shader)
{
GPU_shader_free(shader);


@ -254,9 +254,20 @@ void GPU_materials_free(struct Main *bmain);
struct Scene *GPU_material_scene(GPUMaterial *material);
struct GPUPass *GPU_material_get_pass(GPUMaterial *material);
/* Return the most optimal shader configuration for the given material. */
struct GPUShader *GPU_material_get_shader(GPUMaterial *material);
/* Return the base un-optimized shader. */
struct GPUShader *GPU_material_get_shader_base(GPUMaterial *material);
const char *GPU_material_get_name(GPUMaterial *material);
/**
* Material Optimization.
* \note Compiles optimal version of shader graph, populating mat->optimized_pass.
* This operation should always be deferred until existing compilations have completed.
* Default un-optimized materials will still exist for interactive material editing performance.
*/
void GPU_material_optimize(GPUMaterial *mat);
/**
* Return can be NULL if it's a world material.
*/
@ -274,6 +285,24 @@ eGPUMaterialOptimizationStatus GPU_material_optimization_status(GPUMaterial *mat
void GPU_material_optimization_status_set(GPUMaterial *mat, eGPUMaterialOptimizationStatus status);
bool GPU_material_optimization_ready(GPUMaterial *mat);
/**
* Store reference to a similar default material for async PSO cache warming.
*
* This function expects `material` to have not yet been compiled and for `default_material` to be
* ready. When compiling `material` as part of an async shader compilation job, use existing PSO
* descriptors from `default_material`'s shader to also compile PSOs for this new material
* asynchronously, rather than at runtime.
*
* The `default_material` options should match this new material's options in order
* for its PSO descriptors to match those needed by the new `material`.
*
* NOTE: `default_material` must exist when `GPU_material_compile(..)` is called for
* `material`.
*
* See `GPU_shader_warm_cache(..)` for more information.
*/
void GPU_material_set_default(GPUMaterial *material, GPUMaterial *default_material);
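For illustration, a minimal hedged sketch of the call order this note describes (the wrapper function and variable names are hypothetical and not part of this patch; it assumes the default material is already compiled, mirroring the EEVEE_material_get() change earlier in this diff):

/* Sketch only (not part of this patch): wiring a queued material to its already-compiled
 * default so the deferred compile job can warm its PSO cache. `queued_mat` and `default_mat`
 * are hypothetical locals. */
static void example_register_default_for_warming(GPUMaterial *queued_mat, GPUMaterial *default_mat)
{
  /* The default material must already be compiled and use matching options so its cached
   * PSO descriptors apply to the queued material (see the note above). */
  if (queued_mat != default_mat && GPU_material_status(default_mat) == GPU_MAT_SUCCESS) {
    GPU_material_set_default(queued_mat, default_mat);
  }
  /* When the deferred job later calls GPU_material_compile(queued_mat), the backend can
   * warm the new shader's PSO cache from the default material's shader. */
}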
struct GPUUniformBuf *GPU_material_uniform_buffer_get(GPUMaterial *material);
/**
* Create dynamic UBO from parameters


@ -217,6 +217,47 @@ GPUShader *GPU_shader_create_ex(const char *vertcode,
bool GPU_shader_transform_feedback_enable(GPUShader *shader, struct GPUVertBuf *vertbuf);
void GPU_shader_transform_feedback_disable(GPUShader *shader);
/**
* Shader cache warming.
* For each shader, rendering APIs perform a two-step compilation:
*
* * The first stage is Front-End compilation which only needs to be performed once, and generates
* a portable intermediate representation. This happens during `gpu::Shader::finalize()`.
*
* * The second is Back-End compilation which compiles a device-specific executable shader
* program. This compilation requires some contextual pipeline state which is baked into the
* executable shader source, producing a Pipeline State Object (PSO). In OpenGL, backend
* compilation happens in the background, within the driver, but can still incur runtime stutters.
* In Metal/Vulkan, PSOs are compiled explicitly. These are currently resolved within the backend
* based on the current pipeline state and can incur runtime stalls when they occur.
*
* Shader Cache warming uses the specified parent shader set using `GPU_shader_set_parent(..)` as a
* template reference for pre-compiling Render Pipeline State Objects (PSOs) outside of the main
* render pipeline.
*
* PSOs require descriptors containing information on the render state for a given shader, which
* includes input vertex data layout and output pixel formats, along with some state such as
* blend mode and colour output masks. As this state information is usually consistent between
* similar draws, we can assign a parent shader and use this shader's cached pipeline states to
* prime compilations.
*
* Shaders do not necessarily have to be similar in functionality to be used as a parent, so long
* as the GPUVertFormat and GPUFrameBuffer which they are used with remain the same. Other bindings
* such as textures, uniforms and UBOs are all assigned independently as dynamic state.
*
* This function should be called asynchronously, mitigating the impact of run-time stuttering from
* dynamic compilation of PSOs during normal rendering.
*
* \param shader: The shader whose cache to warm.
* \param limit: The maximum number of PSOs to compile within a call. Specifying
* a limit <= 0 will compile PSOs for all cached descriptors in the parent shader. */
void GPU_shader_warm_cache(GPUShader *shader, int limit);
/* We expect the parent shader to be compiled and already have some cached PSOs when being assigned
* as a reference. Ensure the parent shader still exists when `GPU_shader_warm_cache(..)` is
* called. */
void GPU_shader_set_parent(GPUShader *shader, GPUShader *parent);
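A minimal usage sketch of the two declarations above (hypothetical wrapper, not part of this patch; assumes both shaders are compiled and are used with the same GPUVertFormat and GPUFrameBuffer configuration, as the note requires):

/* Sketch only (not part of this patch): typical warming sequence for a freshly compiled
 * shader, as done in GPU_material_compile() later in this diff. */
static void example_warm_from_parent(GPUShader *new_sh, GPUShader *parent_sh)
{
  if (parent_sh == NULL || parent_sh == new_sh) {
    return;
  }
  GPU_shader_set_parent(new_sh, parent_sh);
  /* A small limit keeps the async job cheap; per the note above, the first cached entries
   * tend to be the most commonly used PSOs. A limit <= 0 would warm every cached PSO. */
  GPU_shader_warm_cache(new_sh, 1);
}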
/** DEPRECATED: Kept only because of BGL API. */
int GPU_shader_get_program(GPUShader *shader);


@ -25,6 +25,7 @@
#include "BKE_material.h"
#include "GPU_capabilities.h"
#include "GPU_context.h"
#include "GPU_material.h"
#include "GPU_shader.h"
#include "GPU_uniform_buffer.h"
@ -95,6 +96,9 @@ struct GPUPass {
uint32_t hash;
/** Did we already try to compile the attached GPUShader? */
bool compiled;
/** Hint that an optimized variant of this pass should be created based on a complexity heuristic
* during pass code generation. */
bool should_optimize;
};
/* -------------------------------------------------------------------- */
@ -252,6 +256,11 @@ class GPUCodegen {
ListBase ubo_inputs_ = {nullptr, nullptr};
GPUInput *cryptomatte_input_ = nullptr;
/** Cache parameters for complexity heuristic. */
uint nodes_total_ = 0;
uint textures_total_ = 0;
uint uniforms_total_ = 0;
public:
GPUCodegen(GPUMaterial *mat_, GPUNodeGraph *graph_) : mat(*mat_), graph(*graph_)
{
@ -292,6 +301,17 @@ class GPUCodegen {
return hash_;
}
/* Heuristic determined during pass codegen for whether a
* more optimal variant of this material should be compiled. */
bool should_optimize_heuristic() const
{
/* If any of the maximal limits is exceeded, we can optimize, but we should also ensure
* the baseline is met. */
bool do_optimize = (nodes_total_ >= 60 || textures_total_ >= 4 || uniforms_total_ >= 64) &&
(textures_total_ >= 1 && uniforms_total_ >= 8 && nodes_total_ >= 4);
return do_optimize;
}
private:
void set_unique_ids();
@ -413,6 +433,9 @@ void GPUCodegen::generate_resources()
}
}
/* Record texture count for the complexity heuristic. */
textures_total_ = slot;
if (!BLI_listbase_is_empty(&ubo_inputs_)) {
/* NOTE: generate_uniform_buffer() should have sorted the inputs before this. */
ss << "struct NodeTree {\n";
@ -454,11 +477,16 @@ void GPUCodegen::generate_library()
GPUCodegenCreateInfo &info = *create_info;
void *value;
GSetIterState pop_state = {};
while (BLI_gset_pop(graph.used_libraries, &pop_state, &value)) {
/* Iterate over libraries. We need to keep this struct intact in case
* it is required for the optimization pass. */
GHashIterator *ihash = BLI_ghashIterator_new((GHash *)graph.used_libraries);
while (!BLI_ghashIterator_done(ihash)) {
value = BLI_ghashIterator_getKey(ihash);
auto deps = gpu_shader_dependency_get_resolved_source((const char *)value);
info.dependencies_generated.extend_non_duplicates(deps);
BLI_ghashIterator_step(ihash);
}
BLI_ghashIterator_free(ihash);
}
void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
@ -526,6 +554,9 @@ void GPUCodegen::node_serialize(std::stringstream &eval_ss, const GPUNode *node)
}
}
eval_ss << ");\n\n";
/* Increment heuristic. */
nodes_total_++;
}
char *GPUCodegen::graph_serialize(eGPUNodeTag tree_tag, GPUNodeLink *output_link)
@ -589,6 +620,7 @@ void GPUCodegen::generate_uniform_buffer()
if (input->source == GPU_SOURCE_UNIFORM && !input->link) {
/* We handle the UBO uniforms separately. */
BLI_addtail(&ubo_inputs_, BLI_genericNodeN(input));
uniforms_total_++;
}
}
}
@ -661,10 +693,17 @@ void GPUCodegen::generate_graphs()
GPUPass *GPU_generate_pass(GPUMaterial *material,
GPUNodeGraph *graph,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk)
void *thunk,
bool optimize_graph)
{
gpu_node_graph_prune_unused(graph);
/* If the optimize_graph flag is passed, we are generating an optimized
* variant of the GPUMaterial's GPUPass. */
if (optimize_graph) {
gpu_node_graph_optimize(graph);
}
/* Extract attributes before compiling so the generated VBOs are ready to accept the future
* shader. */
gpu_node_graph_finalize_uniform_attrs(graph);
@ -672,23 +711,33 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
GPUCodegen codegen(material, graph);
codegen.generate_graphs();
codegen.generate_cryptomatte();
codegen.generate_uniform_buffer();
/* Cache lookup: Reuse shaders already compiled. */
GPUPass *pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
GPUPass *pass_hash = nullptr;
/* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
* there is no way to have a collision currently. Some advocated to only use a bigger hash. */
if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
if (!gpu_pass_is_valid(pass_hash)) {
/* Shader has already been created but failed to compile. */
return nullptr;
if (!optimize_graph) {
/* The optimized version of the shader should not re-generate a UBO.
* The UBO will not be used for this variant. */
codegen.generate_uniform_buffer();
/** Cache lookup: Reuse shaders already compiled.
* NOTE: We only perform cache look-up for non-optimized shader
* graphs, as baked constant data amongst other optimizations will generate too many
* shader source permutations, with minimal re-usability. */
pass_hash = gpu_pass_cache_lookup(codegen.hash_get());
/* FIXME(fclem): This is broken. Since we only check for the hash and not the full source
* there is no way to have a collision currently. Some advocated to only use a bigger hash. */
if (pass_hash && (pass_hash->next == nullptr || pass_hash->next->hash != codegen.hash_get())) {
if (!gpu_pass_is_valid(pass_hash)) {
/* Shader has already been created but failed to compile. */
return nullptr;
}
/* No collision, just return the pass. */
BLI_spin_lock(&pass_cache_spin);
pass_hash->refcount += 1;
BLI_spin_unlock(&pass_cache_spin);
return pass_hash;
}
/* No collision, just return the pass. */
BLI_spin_lock(&pass_cache_spin);
pass_hash->refcount += 1;
BLI_spin_unlock(&pass_cache_spin);
return pass_hash;
}
/* Either the shader is not compiled or there is a hash collision...
@ -726,14 +775,33 @@ GPUPass *GPU_generate_pass(GPUMaterial *material,
pass->create_info = codegen.create_info;
pass->hash = codegen.hash_get();
pass->compiled = false;
/* Only flag the pass optimization hint if this is the first generated pass for a material.
* Optimized passes cannot be optimized further, even if the heuristic would still
* flag them as candidates. */
pass->should_optimize = (!optimize_graph) && codegen.should_optimize_heuristic();
codegen.create_info = nullptr;
gpu_pass_cache_insert_after(pass_hash, pass);
/* Only insert non-optimized graphs into the cache.
* Optimized graphs will continuously be recompiled with new unique source during material
* editing, which would quickly fill up the cache with materials offering minimal
* re-use. */
if (!optimize_graph) {
gpu_pass_cache_insert_after(pass_hash, pass);
}
}
return pass;
}
bool GPU_pass_should_optimize(GPUPass *pass)
{
/* Returns optimization heuristic prepared during
* initial codegen.
* NOTE: Optimization currently limited to Metal backend as repeated compilations required for
* material specialization cause impactful CPU stalls on OpenGL platforms. */
return (GPU_backend_get_type() == GPU_BACKEND_METAL) && pass->should_optimize;
}
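As a worked example of the heuristic gating above, a standalone restatement (hypothetical helper, not part of the patch) with the same thresholds as GPUCodegen::should_optimize_heuristic():

/* Sketch only: the complexity heuristic restated standalone for clarity. */
static bool example_should_optimize(unsigned int nodes, unsigned int textures, unsigned int uniforms)
{
  const bool exceeds_a_limit = (nodes >= 60 || textures >= 4 || uniforms >= 64);
  const bool meets_baseline = (textures >= 1 && uniforms >= 8 && nodes >= 4);
  /* e.g. (70, 2, 10) -> true (node count alone exceeds a limit and the baseline is met);
   * (10, 1, 8) -> false (no limit exceeded). */
  return exceeds_a_limit && meets_baseline;
}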
/** \} */
/* -------------------------------------------------------------------- */


@ -25,10 +25,12 @@ typedef struct GPUPass GPUPass;
GPUPass *GPU_generate_pass(GPUMaterial *material,
struct GPUNodeGraph *graph,
GPUCodegenCallbackFn finalize_source_cb,
void *thunk);
void *thunk,
bool optimize_graph);
GPUShader *GPU_pass_shader_get(GPUPass *pass);
bool GPU_pass_compile(GPUPass *pass, const char *shname);
void GPU_pass_release(GPUPass *pass);
bool GPU_pass_should_optimize(GPUPass *pass);
/* Module */


@ -34,6 +34,8 @@
#include "DRW_engine.h"
#include "PIL_time.h"
#include "gpu_codegen.h"
#include "gpu_node_graph.h"
@ -43,6 +45,17 @@
#define MAX_COLOR_BAND 128
#define MAX_GPU_SKIES 8
/** Whether the optimized variant of the GPUPass should be created asynchronously.
* Usage of this depends on whether doing so introduces any threading challenges.
* Currently, the overhead of GPU_generate_pass is relatively small in comparison to shader
* compilation, though this option exists in case any scenario of material graph
* optimization causes a slowdown on the main thread.
*
* NOTE: The actual shader program for the optimized pass will always be compiled asynchronously,
* this flag controls whether shader node graph source serialization happens on the compilation
* worker thread as well. */
#define ASYNC_OPTIMIZED_PASS_CREATION 0
typedef struct GPUColorBandBuilder {
float pixels[MAX_COLOR_BAND][CM_TABLE + 1][4];
int current_layer;
@ -57,6 +70,27 @@ struct GPUMaterial {
/* Contains #GPUShader and source code for deferred compilation.
* Can be shared between similar material (i.e: sharing same node-tree topology). */
GPUPass *pass;
/* Optimized GPUPass, situationally compiled after the initial pass for optimal realtime
* performance. This shader variant bakes dynamic uniform data as constant. This variant will not
* use the UBO, instead baking constants directly into the shader source. */
GPUPass *optimized_pass;
/* Optimization status.
* We also use this status to determine whether this material should be considered for
* optimization. Only sufficiently complex shaders benefit from constant-folding optimizations.
* `GPU_MAT_OPTIMIZATION_READY` -> shader should be optimized and is ready for optimization.
* `GPU_MAT_OPTIMIZATION_SKIP` -> Shader should not be optimized as it would not benefit
* performance to do so, based on the heuristic.
*/
eGPUMaterialOptimizationStatus optimization_status;
double creation_time;
#if ASYNC_OPTIMIZED_PASS_CREATION == 1
struct DeferredOptimizePass {
GPUCodegenCallbackFn callback;
void *thunk;
} DeferredOptimizePass;
struct DeferredOptimizePass optimize_pass_info;
#endif
/** UBOs for this material parameters. */
GPUUniformBuf *ubo;
/** Compilation status. Do not use if shader is not GPU_MAT_SUCCESS. */
@ -86,6 +120,12 @@ struct GPUMaterial {
/* Low level node graph(s). Also contains resources needed by the material. */
GPUNodeGraph graph;
/** Default material reference used for PSO cache warming. Default materials may perform
* different operations, but the permutation will frequently share the same input PSO
* descriptors. This enables async PSO compilation as part of the deferred compilation
* pass, reducing runtime stuttering and improving responsiveness while compiling materials. */
GPUMaterial *default_mat;
/** DEPRECATED: To remove. */
bool has_surface_output;
bool has_volume_output;
@ -214,6 +254,9 @@ void GPU_material_free_single(GPUMaterial *material)
gpu_node_graph_free(&material->graph);
if (material->optimized_pass != NULL) {
GPU_pass_release(material->optimized_pass);
}
if (material->pass != NULL) {
GPU_pass_release(material->pass);
}
@ -252,12 +295,29 @@ Scene *GPU_material_scene(GPUMaterial *material)
GPUPass *GPU_material_get_pass(GPUMaterial *material)
{
return material->pass;
/* If an optimized pass variant is available, and optimization is
* flagged as complete, we use this one instead. */
return ((GPU_material_optimization_status(material) == GPU_MAT_OPTIMIZATION_SUCCESS) &&
material->optimized_pass) ?
material->optimized_pass :
material->pass;
}
GPUShader *GPU_material_get_shader(GPUMaterial *material)
{
return material->pass ? GPU_pass_shader_get(material->pass) : NULL;
/* If an optimized material shader variant is available, and optimization is
* flagged as complete, we use this one instead. */
GPUShader *shader = ((GPU_material_optimization_status(material) ==
GPU_MAT_OPTIMIZATION_SUCCESS) &&
material->optimized_pass) ?
GPU_pass_shader_get(material->optimized_pass) :
NULL;
return (shader) ? shader : ((material->pass) ? GPU_pass_shader_get(material->pass) : NULL);
}
GPUShader *GPU_material_get_shader_base(GPUMaterial *material)
{
return (material->pass) ? GPU_pass_shader_get(material->pass) : NULL;
}
const char *GPU_material_get_name(GPUMaterial *material)
@ -665,6 +725,41 @@ void GPU_material_status_set(GPUMaterial *mat, eGPUMaterialStatus status)
mat->status = status;
}
eGPUMaterialOptimizationStatus GPU_material_optimization_status(GPUMaterial *mat)
{
return mat->optimization_status;
}
void GPU_material_optimization_status_set(GPUMaterial *mat, eGPUMaterialOptimizationStatus status)
{
mat->optimization_status = status;
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_READY) {
/* Reset creation timer to delay optimization pass. */
mat->creation_time = PIL_check_seconds_timer();
}
}
bool GPU_material_optimization_ready(GPUMaterial *mat)
{
/* Timer threshold before optimizations will be queued.
* When materials are frequently being modified, optimization
* can incur CPU overhead from excessive compilation.
*
* As the optimization is entirely asynchronous, it is still beneficial
* to do this quickly to avoid build-up and improve runtime performance.
* The threshold just prevents compilations being queued frame after frame. */
const double optimization_time_threshold_s = 1.2;
return ((PIL_check_seconds_timer() - mat->creation_time) >= optimization_time_threshold_s);
}
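For reference, a hedged sketch summarizing the optimization-status lifecycle these functions drive (hypothetical debug helper, not part of the patch; it only covers the states that appear in this diff):

/* Sketch only: map the optimization states introduced in this patch to their meaning. */
static const char *example_optimization_state_name(eGPUMaterialOptimizationStatus status)
{
  switch (status) {
    case GPU_MAT_OPTIMIZATION_SKIP:
      /* Heuristic not met, or optimization failed; the base pass is final. */
      return "skip";
    case GPU_MAT_OPTIMIZATION_READY:
      /* Heuristic met at pass generation; waiting for the delay timer before queuing. */
      return "ready";
    case GPU_MAT_OPTIMIZATION_QUEUED:
      /* In the deferred optimize_queue, awaiting GPU_material_optimize(). */
      return "queued";
    case GPU_MAT_OPTIMIZATION_SUCCESS:
      /* Optimized pass compiled; GPU_material_get_pass()/GPU_material_get_shader() prefer it. */
      return "success";
  }
  return "unknown";
}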
void GPU_material_set_default(GPUMaterial *material, GPUMaterial *default_material)
{
BLI_assert(material != default_material);
if (material != default_material) {
material->default_mat = default_material;
}
}
/* Code generation */
bool GPU_material_has_surface_output(GPUMaterial *mat)
@ -730,6 +825,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
mat->uuid = shader_uuid;
mat->flag = GPU_MATFLAG_UPDATED;
mat->status = GPU_MAT_CREATED;
mat->default_mat = NULL;
mat->is_volume_shader = is_volume_shader;
mat->graph.used_libraries = BLI_gset_new(
BLI_ghashutil_ptrhash, BLI_ghashutil_ptrcmp, "GPUNodeGraph.used_libraries");
@ -748,7 +844,7 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
{
/* Create source code and search pass cache for an already compiled version. */
mat->pass = GPU_generate_pass(mat, &mat->graph, callback, thunk);
mat->pass = GPU_generate_pass(mat, &mat->graph, callback, thunk, false);
if (mat->pass == NULL) {
/* We had a cache hit and the shader has already failed to compile. */
@ -756,11 +852,44 @@ GPUMaterial *GPU_material_from_nodetree(Scene *scene,
gpu_node_graph_free(&mat->graph);
}
else {
/* Determine whether we should generate an optimized variant of the graph.
* Heuristic is based on complexity of default material pass and shader node graph. */
if (GPU_pass_should_optimize(mat->pass)) {
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_READY);
}
GPUShader *sh = GPU_pass_shader_get(mat->pass);
if (sh != NULL) {
/* We had a cache hit and the shader is already compiled. */
mat->status = GPU_MAT_SUCCESS;
gpu_node_graph_free_nodes(&mat->graph);
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_SKIP) {
gpu_node_graph_free_nodes(&mat->graph);
}
}
/* Generate optimized pass. */
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_READY) {
#if ASYNC_OPTIMIZED_PASS_CREATION == 1
mat->optimized_pass = NULL;
mat->optimize_pass_info.callback = callback;
mat->optimize_pass_info.thunk = thunk;
#else
mat->optimized_pass = GPU_generate_pass(mat, &mat->graph, callback, thunk, true);
if (mat->optimized_pass == NULL) {
/* Failed to create optimized pass. */
gpu_node_graph_free_nodes(&mat->graph);
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
}
else {
GPUShader *optimized_sh = GPU_pass_shader_get(mat->optimized_pass);
if (optimized_sh != NULL) {
/* Optimized shader already available. */
gpu_node_graph_free_nodes(&mat->graph);
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SUCCESS);
}
}
#endif
}
}
}
@ -810,8 +939,37 @@ void GPU_material_compile(GPUMaterial *mat)
if (success) {
GPUShader *sh = GPU_pass_shader_get(mat->pass);
if (sh != NULL) {
/** Perform async Render Pipeline State Object (PSO) compilation.
*
* Warm PSO cache within async compilation thread using default material as source.
* GPU_shader_warm_cache(..) performs the API-specific PSO compilation using the assigned
* parent shader's cached PSO descriptors as an input.
*
* This is only applied if the given material has a specified default reference
* material available, and the default material is already compiled.
*
* As PSOs do not always match for default shaders, we limit warming to a small number of PSO
* configurations to ensure compile time remains fast, as these first
* entries will be the most commonly used PSOs. As not all PSOs are necessarily
* required immediately, this limit should remain low (1-3 at most). */
if (mat->default_mat != NULL && mat->default_mat != mat) {
if (mat->default_mat->pass != NULL) {
GPUShader *parent_sh = GPU_pass_shader_get(mat->default_mat->pass);
if (parent_sh) {
GPU_shader_set_parent(sh, parent_sh);
GPU_shader_warm_cache(sh, 1);
}
}
}
/* Flag success. */
mat->status = GPU_MAT_SUCCESS;
gpu_node_graph_free_nodes(&mat->graph);
if (mat->optimization_status == GPU_MAT_OPTIMIZATION_SKIP) {
/* Only free node graph nodes if not required by secondary optimization pass. */
gpu_node_graph_free_nodes(&mat->graph);
}
}
else {
mat->status = GPU_MAT_FAILED;
@ -825,6 +983,89 @@ void GPU_material_compile(GPUMaterial *mat)
}
}
void GPU_material_optimize(GPUMaterial *mat)
{
/* If shader is flagged for skipping optimization or has already been successfully
* optimized, skip. */
if (ELEM(mat->optimization_status, GPU_MAT_OPTIMIZATION_SKIP, GPU_MAT_OPTIMIZATION_SUCCESS)) {
return;
}
/* If original shader has not been fully compiled, we are not
* ready to perform optimization. */
if (mat->status != GPU_MAT_SUCCESS) {
/* Reset optimization status. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_READY);
return;
}
#if ASYNC_OPTIMIZED_PASS_CREATION == 1
/* If the optimized pass is not valid, first generate optimized pass.
* NOTE(Threading): Need to verify if GPU_generate_pass can cause side-effects, especially when
* used with "thunk". So far, this appears to work, and deferring optimized pass creation is more
* optimal, as these do not benefit from caching due to baked constants. However, this could
* be a cause for concern in certain cases. */
if (!mat->optimized_pass) {
mat->optimized_pass = GPU_generate_pass(
mat, &mat->graph, mat->optimize_pass_info.callback, mat->optimize_pass_info.thunk, true);
BLI_assert(mat->optimized_pass);
}
#else
if (!mat->optimized_pass) {
/* Optimized pass has not been created, skip future optimization attempts. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
return;
}
#endif
bool success;
/* NOTE: The shader may have already been compiled here since we are
* sharing GPUShader across GPUMaterials. In this case it's a no-op. */
#ifndef NDEBUG
success = GPU_pass_compile(mat->optimized_pass, mat->name);
#else
success = GPU_pass_compile(mat->optimized_pass, __func__);
#endif
if (success) {
GPUShader *sh = GPU_pass_shader_get(mat->optimized_pass);
if (sh != NULL) {
/** Perform async Render Pipeline State Object (PSO) compilation.
*
* Warm PSO cache within async compilation thread for optimized materials.
* This setup assigns the original unoptimized shader as a "parent" shader
* for the optimized version. This then allows the associated GPU backend to
* compile PSOs within this asynchronous pass, using the identical PSO descriptors of the
* parent shader.
*
* This eliminates all run-time stuttering associated with material optimization and ensures
* realtime material editing and animation remain seamless, while retaining optimal realtime
* performance. */
GPUShader *parent_sh = GPU_pass_shader_get(mat->pass);
if (parent_sh) {
GPU_shader_set_parent(sh, parent_sh);
GPU_shader_warm_cache(sh, -1);
}
/* Mark as complete. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SUCCESS);
}
else {
/* Optimized pass failed to compile. Disable any future optimization attempts. */
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
}
}
else {
/* Optimization pass generation failed. Disable future attempts to optimize. */
GPU_pass_release(mat->optimized_pass);
mat->optimized_pass = NULL;
GPU_material_optimization_status_set(mat, GPU_MAT_OPTIMIZATION_SKIP);
}
/* Release node graph as no longer needed. */
gpu_node_graph_free_nodes(&mat->graph);
}
void GPU_materials_free(Main *bmain)
{
LISTBASE_FOREACH (Material *, ma, &bmain->materials) {
@ -847,6 +1088,9 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
material->graph.used_libraries = BLI_gset_new(
BLI_ghashutil_ptrhash, BLI_ghashutil_ptrcmp, "GPUNodeGraph.used_libraries");
material->refcount = 1;
material->optimization_status = GPU_MAT_OPTIMIZATION_SKIP;
material->optimized_pass = NULL;
material->default_mat = NULL;
/* Construct the material graph by adding and linking the necessary GPU material nodes. */
construct_function_cb(thunk, material);
@ -855,7 +1099,9 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
gpu_material_ramp_texture_build(material);
/* Lookup an existing pass in the cache or generate a new one. */
material->pass = GPU_generate_pass(material, &material->graph, generate_code_function_cb, thunk);
material->pass = GPU_generate_pass(
material, &material->graph, generate_code_function_cb, thunk, false);
material->optimized_pass = NULL;
/* The pass already exists in the pass cache but its shader already failed to compile. */
if (material->pass == NULL) {
@ -868,7 +1114,10 @@ GPUMaterial *GPU_material_from_callbacks(ConstructGPUMaterialFn construct_functi
GPUShader *shader = GPU_pass_shader_get(material->pass);
if (shader != NULL) {
material->status = GPU_MAT_SUCCESS;
gpu_node_graph_free_nodes(&material->graph);
if (material->optimization_status == GPU_MAT_OPTIMIZATION_SKIP) {
/* Only free node graph if not required by secondary optimization pass. */
gpu_node_graph_free_nodes(&material->graph);
}
return material;
}


@ -983,3 +983,22 @@ void gpu_node_graph_prune_unused(GPUNodeGraph *graph)
}
}
}
void gpu_node_graph_optimize(GPUNodeGraph *graph)
{
/* Replace all uniform node links with constant. */
LISTBASE_FOREACH (GPUNode *, node, &graph->nodes) {
LISTBASE_FOREACH (GPUInput *, input, &node->inputs) {
if (input->link) {
if (input->link->link_type == GPU_NODE_LINK_UNIFORM) {
input->link->link_type = GPU_NODE_LINK_CONSTANT;
}
}
if (input->source == GPU_SOURCE_UNIFORM) {
input->source = (input->type == GPU_CLOSURE) ? GPU_SOURCE_STRUCT : GPU_SOURCE_CONSTANT;
}
}
}
/* TODO: Consider performing other node graph optimizations here. */
}


@ -190,6 +190,19 @@ void gpu_nodes_tag(GPUNodeLink *link, eGPUNodeTag tag);
void gpu_node_graph_prune_unused(GPUNodeGraph *graph);
void gpu_node_graph_finalize_uniform_attrs(GPUNodeGraph *graph);
/**
* Optimize node graph for optimized material shader path.
* Once the base material has been generated, we can modify the shader
* node graph to create one which will produce an optimally performing shader.
* This currently involves baking uniform data into constant data to enable
* aggressive constant folding by the compiler in order to reduce complexity and
* shader core memory pressure.
*
* NOTE: Graph optimizations will produce a shader which needs to be re-compiled
* more frequently, however, the default material pass will always exist to fall
* back on. */
void gpu_node_graph_optimize(GPUNodeGraph *graph);
/**
* Free intermediate node graph.
*/


@ -500,6 +500,26 @@ const char *GPU_shader_get_name(GPUShader *shader)
return unwrap(shader)->name_get();
}
/* -------------------------------------------------------------------- */
/** \name Shader cache warming
* \{ */
void GPU_shader_set_parent(GPUShader *shader, GPUShader *parent)
{
BLI_assert(shader != nullptr);
BLI_assert(shader != parent);
if (shader != parent) {
Shader *shd_child = unwrap(shader);
Shader *shd_parent = unwrap(parent);
shd_child->parent_set(shd_parent);
}
}
void GPU_shader_warm_cache(GPUShader *shader, int limit)
{
unwrap(shader)->warm_cache(limit);
}
/** \} */
/* -------------------------------------------------------------------- */


@ -34,6 +34,12 @@ class Shader {
/** For debugging purpose. */
char name[64];
/* Parent shader can be used for shaders which are derived from the same source material.
* The child shader can pull information from its parent to prepare additional resources
* such as PSOs upfront. This enables asynchronous PSO compilation which mitigates stuttering
* when updating new materials. */
Shader *parent_shader_ = nullptr;
public:
Shader(const char *name);
virtual ~Shader();
@ -43,6 +49,11 @@ class Shader {
virtual void fragment_shader_from_glsl(MutableSpan<const char *> sources) = 0;
virtual void compute_shader_from_glsl(MutableSpan<const char *> sources) = 0;
virtual bool finalize(const shader::ShaderCreateInfo *info = nullptr) = 0;
/* Pre-warms PSOs using parent shader's cached PSO descriptors. Limit specifies maximum PSOs to
* warm. If the limit is <= 0, compiles all PSO permutations in the parent shader.
*
* See `GPU_shader_warm_cache(..)` in `GPU_shader.h` for more information. */
virtual void warm_cache(int limit) = 0;
virtual void transform_feedback_names_set(Span<const char *> name_list,
eGPUShaderTFBType geom_type) = 0;
@ -69,7 +80,17 @@ class Shader {
inline const char *const name_get() const
{
return name;
};
}
inline void parent_set(Shader *parent)
{
parent_shader_ = parent;
}
inline Shader *parent_get() const
{
return parent_shader_;
}
static bool srgb_uniform_dirty_get();
static void set_srgb_uniform(GPUShader *shader);


@ -31,6 +31,14 @@ struct MTLVertexAttributeDescriptorPSO {
return uint64_t((uint64_t(this->format) ^ (this->offset << 4) ^ (this->buffer_index << 8) ^
(this->format_conversion_mode << 12)));
}
void reset()
{
format = MTLVertexFormatInvalid;
offset = 0;
buffer_index = 0;
format_conversion_mode = GPU_FETCH_FLOAT;
}
};
struct MTLVertexBufferLayoutDescriptorPSO {
@ -48,6 +56,13 @@ struct MTLVertexBufferLayoutDescriptorPSO {
{
return uint64_t(uint64_t(this->step_function) ^ (this->step_rate << 4) ^ (this->stride << 8));
}
void reset()
{
step_function = MTLVertexStepFunctionPerVertex;
step_rate = 1;
stride = 0;
}
};
/* SSBO attribute state caching. */
@ -76,6 +91,16 @@ struct MTLSSBOAttribute {
{
return (memcmp(this, &other, sizeof(MTLSSBOAttribute)) == 0);
}
void reset()
{
mtl_attribute_index = 0;
vbo_id = 0;
attribute_offset = 0;
per_vertex_stride = 0;
attribute_format = 0;
is_instance = false;
}
};
struct MTLVertexDescriptor {
@ -241,10 +266,10 @@ struct MTLRenderPipelineStateDescriptor {
hash ^= uint64_t(this->dest_rgb_blend_factor) << 37; /* Up to 18 (5 bits). */
hash ^= uint64_t(this->src_alpha_blend_factor) << 42; /* Up to 18 (5 bits). */
hash ^= uint64_t(this->src_rgb_blend_factor) << 47; /* Up to 18 (5 bits). */
}
for (const uint c : IndexRange(GPU_FB_MAX_COLOR_ATTACHMENT)) {
hash ^= uint64_t(this->color_attachment_format[c]) << (c + 52); /* Up to 555 (9 bits). */
for (const uint c : IndexRange(GPU_FB_MAX_COLOR_ATTACHMENT)) {
hash ^= uint64_t(this->color_attachment_format[c]) << (c + 52); /* Up to 555 (9 bits). */
}
}
hash |= uint64_t((this->blending_enabled && (this->num_color_attachments > 0)) ? 1 : 0) << 62;
@ -262,9 +287,9 @@ struct MTLRenderPipelineStateDescriptor {
vertex_descriptor.total_attributes = 0;
vertex_descriptor.max_attribute_value = 0;
vertex_descriptor.num_vert_buffers = 0;
vertex_descriptor.prim_topology_class = MTLPrimitiveTopologyClassUnspecified;
for (int i = 0; i < GPU_VERT_ATTR_MAX_LEN; i++) {
vertex_descriptor.attributes[i].format = MTLVertexFormatInvalid;
vertex_descriptor.attributes[i].offset = 0;
vertex_descriptor.attributes[i].reset();
}
vertex_descriptor.uses_ssbo_vertex_fetch = false;
vertex_descriptor.num_ssbo_attributes = 0;


@ -76,6 +76,8 @@ struct MTLRenderPipelineStateInstance {
int null_attribute_buffer_index;
/* buffer bind used for transform feedback output buffer. */
int transform_feedback_buffer_index;
/* Topology class. */
MTLPrimitiveTopologyClass prim_type;
/** Reflection Data.
* Currently used to verify whether uniform buffers of incorrect sizes being bound, due to left
@ -188,6 +190,7 @@ class MTLShader : public Shader {
MTLRenderPipelineStateDescriptor current_pipeline_state_;
/* Cache of compiled PipelineStateObjects. */
blender::Map<MTLRenderPipelineStateDescriptor, MTLRenderPipelineStateInstance *> pso_cache_;
std::mutex pso_cache_lock_;
/** Compute pipeline state and Compute PSO caching. */
MTLComputePipelineStateInstance compute_pso_instance_;
@ -256,6 +259,7 @@ class MTLShader : public Shader {
/* Compile and build - Return true if successful. */
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
bool finalize_compute(const shader::ShaderCreateInfo *info);
void warm_cache(int limit) override;
/* Utility. */
bool is_valid()
@ -331,8 +335,14 @@ class MTLShader : public Shader {
void shader_source_from_msl(NSString *input_vertex_source, NSString *input_fragment_source);
void shader_compute_source_from_msl(NSString *input_compute_source);
void set_interface(MTLShaderInterface *interface);
MTLRenderPipelineStateInstance *bake_current_pipeline_state(MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type);
MTLRenderPipelineStateInstance *bake_pipeline_state(
MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type,
const MTLRenderPipelineStateDescriptor &pipeline_descriptor);
bool bake_compute_pipeline_state(MTLContext *ctx);
const MTLComputePipelineStateInstance &get_compute_pipeline_state();


@ -6,6 +6,8 @@
#include "BKE_global.h"
#include "PIL_time.h"
#include "BLI_string.h"
#include <algorithm>
#include <fstream>
@ -110,6 +112,7 @@ MTLShader::~MTLShader()
}
/* Free Pipeline Cache. */
pso_cache_lock_.lock();
for (const MTLRenderPipelineStateInstance *pso_inst : pso_cache_.values()) {
if (pso_inst->vert) {
[pso_inst->vert release];
@ -123,6 +126,7 @@ MTLShader::~MTLShader()
delete pso_inst;
}
pso_cache_.clear();
pso_cache_lock_.unlock();
/* Free Compute pipeline state object. */
if (compute_pso_instance_.compute) {
@ -616,6 +620,36 @@ void MTLShader::push_constant_bindstate_mark_dirty(bool is_dirty)
push_constant_modified_ = is_dirty;
}
void MTLShader::warm_cache(int limit)
{
if (parent_shader_ != nullptr) {
MTLContext *ctx = MTLContext::get();
MTLShader *parent_mtl = reinterpret_cast<MTLShader *>(parent_shader_);
/* Extract PSO descriptors from parent shader. */
blender::Vector<MTLRenderPipelineStateDescriptor> descriptors;
blender::Vector<MTLPrimitiveTopologyClass> prim_classes;
parent_mtl->pso_cache_lock_.lock();
for (const auto &pso_entry : parent_mtl->pso_cache_.items()) {
const MTLRenderPipelineStateDescriptor &pso_descriptor = pso_entry.key;
const MTLRenderPipelineStateInstance *pso_inst = pso_entry.value;
descriptors.append(pso_descriptor);
prim_classes.append(pso_inst->prim_type);
}
parent_mtl->pso_cache_lock_.unlock();
/* Warm shader cache with applied limit.
* If limit is <= 0, compile all PSO permutations. */
limit = (limit > 0) ? limit : descriptors.size();
for (int i : IndexRange(min_ii(descriptors.size(), limit))) {
const MTLRenderPipelineStateDescriptor &pso_descriptor = descriptors[i];
const MTLPrimitiveTopologyClass &prim_class = prim_classes[i];
bake_pipeline_state(ctx, prim_class, pso_descriptor);
}
}
}
/** \} */
/* -------------------------------------------------------------------- */
@ -681,12 +715,10 @@ void MTLShader::set_interface(MTLShaderInterface *interface)
MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
MTLContext *ctx, MTLPrimitiveTopologyClass prim_type)
{
/** Populate global pipeline descriptor and use this to prepare new PSO. */
/* NOTE(Metal): PSO cache can be accessed from multiple threads, though these operations should
* be thread-safe due to organization of high-level renderer. If there are any issues, then
* access can be guarded as appropriate. */
BLI_assert(this);
MTLShaderInterface *mtl_interface = this->get_interface();
BLI_assert(mtl_interface);
BLI_assert(this->is_valid());
/* NOTE(Metal): Vertex input assembly description will have been populated externally
@ -756,15 +788,32 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
pipeline_descriptor.vertex_descriptor.prim_topology_class =
(requires_specific_topology_class) ? prim_type : MTLPrimitiveTopologyClassUnspecified;
/* Bake pipeline state using global descriptor. */
return bake_pipeline_state(ctx, prim_type, pipeline_descriptor);
}
/* Variant which bakes a pipeline state based on an existing MTLRenderPipelineStateDescriptor.
* This function should be callable from a secondary compilation thread. */
MTLRenderPipelineStateInstance *MTLShader::bake_pipeline_state(
MTLContext *ctx,
MTLPrimitiveTopologyClass prim_type,
const MTLRenderPipelineStateDescriptor &pipeline_descriptor)
{
/* Fetch shader interface. */
MTLShaderInterface *mtl_interface = this->get_interface();
BLI_assert(mtl_interface);
BLI_assert(this->is_valid());
/* Check if current PSO exists in the cache. */
pso_cache_lock_.lock();
MTLRenderPipelineStateInstance **pso_lookup = pso_cache_.lookup_ptr(pipeline_descriptor);
MTLRenderPipelineStateInstance *pipeline_state = (pso_lookup) ? *pso_lookup : nullptr;
pso_cache_lock_.unlock();
if (pipeline_state != nullptr) {
return pipeline_state;
}
shader_debug_printf("Baking new pipeline variant for shader: %s\n", this->name);
/* Generate new Render Pipeline State Object (PSO). */
@autoreleasepool {
/* Prepare Render Pipeline Descriptor. */
@ -774,7 +823,6 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
MTLFunctionConstantValues *values = [[MTLFunctionConstantValues new] autorelease];
/* Prepare Vertex descriptor based on current pipeline vertex binding state. */
MTLRenderPipelineStateDescriptor &current_state = pipeline_descriptor;
MTLRenderPipelineDescriptor *desc = pso_descriptor_;
[desc reset];
pso_descriptor_.label = [NSString stringWithUTF8String:this->name];
@ -784,7 +832,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
* specialization constant, customized per unique pipeline state permutation.
*
* NOTE: For binding point compaction, we could use the number of VBOs present
* in the current PSO configuration `current_state.vertex_descriptor.num_vert_buffers`).
* in the current PSO configuration `pipeline_descriptor.vertex_descriptor.num_vert_buffers`).
* However, it is more efficient to simply offset the uniform buffer base index to the
* maximal number of VBO bind-points, as then UBO bind-points for similar draw calls
* will align and avoid the requirement for additional binding. */
@ -792,7 +840,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
/* Null buffer index is used if an attribute is not found in the
* bound VBOs #VertexFormat. */
int null_buffer_index = current_state.vertex_descriptor.num_vert_buffers;
int null_buffer_index = pipeline_descriptor.vertex_descriptor.num_vert_buffers;
bool using_null_buffer = false;
if (this->get_uses_ssbo_vertex_fetch()) {
@ -806,11 +854,12 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
MTL_uniform_buffer_base_index = MTL_SSBO_VERTEX_FETCH_IBO_INDEX + 1;
}
else {
for (const uint i : IndexRange(current_state.vertex_descriptor.max_attribute_value + 1)) {
for (const uint i :
IndexRange(pipeline_descriptor.vertex_descriptor.max_attribute_value + 1)) {
/* Metal back-end attribute descriptor state. */
MTLVertexAttributeDescriptorPSO &attribute_desc =
current_state.vertex_descriptor.attributes[i];
const MTLVertexAttributeDescriptorPSO &attribute_desc =
pipeline_descriptor.vertex_descriptor.attributes[i];
/* Flag format conversion */
/* In some cases, Metal cannot implicitly convert between data types.
@ -860,10 +909,10 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
mtl_attribute.bufferIndex = attribute_desc.buffer_index;
}
for (const uint i : IndexRange(current_state.vertex_descriptor.num_vert_buffers)) {
for (const uint i : IndexRange(pipeline_descriptor.vertex_descriptor.num_vert_buffers)) {
/* Metal back-end state buffer layout. */
const MTLVertexBufferLayoutDescriptorPSO &buf_layout =
current_state.vertex_descriptor.buffer_layouts[i];
pipeline_descriptor.vertex_descriptor.buffer_layouts[i];
/* Copy metal back-end buffer layout state into PSO descriptor.
* NOTE: need to copy each element due to copying from internal
* back-end descriptor to Metal API descriptor. */
@ -875,7 +924,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
}
/* Mark empty attribute conversion. */
for (int i = current_state.vertex_descriptor.max_attribute_value + 1;
for (int i = pipeline_descriptor.vertex_descriptor.max_attribute_value + 1;
i < GPU_VERT_ATTR_MAX_LEN;
i++) {
int MTL_attribute_conversion_mode = 0;
@ -1039,7 +1088,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
for (int color_attachment = 0; color_attachment < GPU_FB_MAX_COLOR_ATTACHMENT;
color_attachment++) {
/* Fetch color attachment pixel format in back-end pipeline state. */
MTLPixelFormat pixel_format = current_state.color_attachment_format[color_attachment];
MTLPixelFormat pixel_format = pipeline_descriptor.color_attachment_format[color_attachment];
/* Populate MTL API PSO attachment descriptor. */
MTLRenderPipelineColorAttachmentDescriptor *col_attachment =
desc.colorAttachments[color_attachment];
@ -1048,19 +1097,19 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
if (pixel_format != MTLPixelFormatInvalid) {
bool format_supports_blending = mtl_format_supports_blending(pixel_format);
col_attachment.writeMask = current_state.color_write_mask;
col_attachment.blendingEnabled = current_state.blending_enabled &&
col_attachment.writeMask = pipeline_descriptor.color_write_mask;
col_attachment.blendingEnabled = pipeline_descriptor.blending_enabled &&
format_supports_blending;
if (format_supports_blending && current_state.blending_enabled) {
col_attachment.alphaBlendOperation = current_state.alpha_blend_op;
col_attachment.rgbBlendOperation = current_state.rgb_blend_op;
col_attachment.destinationAlphaBlendFactor = current_state.dest_alpha_blend_factor;
col_attachment.destinationRGBBlendFactor = current_state.dest_rgb_blend_factor;
col_attachment.sourceAlphaBlendFactor = current_state.src_alpha_blend_factor;
col_attachment.sourceRGBBlendFactor = current_state.src_rgb_blend_factor;
if (format_supports_blending && pipeline_descriptor.blending_enabled) {
col_attachment.alphaBlendOperation = pipeline_descriptor.alpha_blend_op;
col_attachment.rgbBlendOperation = pipeline_descriptor.rgb_blend_op;
col_attachment.destinationAlphaBlendFactor = pipeline_descriptor.dest_alpha_blend_factor;
col_attachment.destinationRGBBlendFactor = pipeline_descriptor.dest_rgb_blend_factor;
col_attachment.sourceAlphaBlendFactor = pipeline_descriptor.src_alpha_blend_factor;
col_attachment.sourceRGBBlendFactor = pipeline_descriptor.src_rgb_blend_factor;
}
else {
if (current_state.blending_enabled && !format_supports_blending) {
if (pipeline_descriptor.blending_enabled && !format_supports_blending) {
shader_debug_printf(
"[Warning] Attempting to Bake PSO, but MTLPixelFormat %d does not support "
"blending\n",
@ -1069,8 +1118,8 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
}
}
}
desc.depthAttachmentPixelFormat = current_state.depth_attachment_format;
desc.stencilAttachmentPixelFormat = current_state.stencil_attachment_format;
desc.depthAttachmentPixelFormat = pipeline_descriptor.depth_attachment_format;
desc.stencilAttachmentPixelFormat = pipeline_descriptor.stencil_attachment_format;
/* Compile PSO */
MTLAutoreleasedRenderPipelineReflection reflection_data;
@ -1090,7 +1139,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
return nullptr;
}
else {
#ifndef NDEBUG
#if 0
NSLog(@"Successfully compiled PSO for shader: %s (Metal Context: %p)\n", this->name, ctx);
#endif
}
@ -1103,7 +1152,7 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
pso_inst->base_uniform_buffer_index = MTL_uniform_buffer_base_index;
pso_inst->null_attribute_buffer_index = (using_null_buffer) ? null_buffer_index : -1;
pso_inst->transform_feedback_buffer_index = MTL_transform_feedback_buffer_index;
pso_inst->shader_pso_index = pso_cache_.size();
pso_inst->prim_type = prim_type;
pso_inst->reflection_data_available = (reflection_data != nil);
if (reflection_data != nil) {
@ -1189,9 +1238,14 @@ MTLRenderPipelineStateInstance *MTLShader::bake_current_pipeline_state(
[pso_inst->pso retain];
/* Insert into pso cache. */
pso_cache_lock_.lock();
pso_inst->shader_pso_index = pso_cache_.size();
pso_cache_.add(pipeline_descriptor, pso_inst);
shader_debug_printf("PSO CACHE: Stored new variant in PSO cache for shader '%s'\n",
this->name);
pso_cache_lock_.unlock();
shader_debug_printf(
"PSO CACHE: Stored new variant in PSO cache for shader '%s' Hash: '%llu'\n",
this->name,
pipeline_descriptor.hash());
return pso_inst;
}
}
@ -1256,7 +1310,7 @@ bool MTLShader::bake_compute_pipeline_state(MTLContext *ctx)
return false;
}
else {
#ifndef NDEBUG
#if 0
NSLog(@"Successfully compiled compute PSO for shader: %s (Metal Context: %p)\n",
this->name,
ctx);


@ -47,6 +47,7 @@ class GLShader : public Shader {
void fragment_shader_from_glsl(MutableSpan<const char *> sources) override;
void compute_shader_from_glsl(MutableSpan<const char *> sources) override;
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
void warm_cache(int limit) override{};
std::string resources_declare(const shader::ShaderCreateInfo &info) const override;
std::string vertex_interface_declare(const shader::ShaderCreateInfo &info) const override;


@ -35,6 +35,7 @@ class VKShader : public Shader {
void fragment_shader_from_glsl(MutableSpan<const char *> sources) override;
void compute_shader_from_glsl(MutableSpan<const char *> sources) override;
bool finalize(const shader::ShaderCreateInfo *info = nullptr) override;
void warm_cache(int limit) override{};
void transform_feedback_names_set(Span<const char *> name_list,
eGPUShaderTFBType geom_type) override;