/* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
|
|
#pragma once
|
|
|
|
#include <atomic>
|
|
#include <functional>
|
|
#include <map>
|
|
#include <mutex>
|
|
#include <set>
|
|
#include <unordered_map>
|
|
|
|
#include "mtl_common.hh"
|
|
|
|
#include <Cocoa/Cocoa.h>
|
|
#include <Metal/Metal.h>
|
|
#include <QuartzCore/QuartzCore.h>
|
|
|
|
@class CAMetalLayer;
|
|
@class MTLCommandQueue;
|
|
@class MTLRenderPipelineState;
|
|
|
|
/* Metal Memory Manager Overview. */
|
|
/*
|
|
* The Metal Backend Memory manager is designed to provide an interface
|
|
* for all other MTL_* modules where memory allocation is required.
|
|
*
|
|
* Different allocation strategies and data-structures are used depending
|
|
* on how the data is used by the backend. These aim to optimally handle
|
|
* system memory and abstract away any complexity from the MTL_* modules
|
|
* themselves.
|
|
*
|
|
* There are two primary allocation modes which can be used:
|
|
*
|
|
* ** MTLScratchBufferManager **
|
|
*
|
|
* Each MTLContext owns a ScratchBufferManager which is implemented
|
|
* as a pool of circular buffers, designed to handle temporary
|
|
* memory allocations which occur on a per-frame basis. The scratch
|
|
* buffers allow flushing of host memory to the GPU to be batched.
|
|
*
|
|
* Each frame, the next scratch buffer is reset, then later flushed upon
|
|
* command buffer submission.
|
|
*
|
|
* NOTE: This is allocated per-context due to allocations being tied
|
|
* to workload submissions and context-specific submissions.
|
|
*
|
|
* Examples of scratch buffer usage are:
|
|
* - Immediate-mode temporary vertex buffers.
|
|
* - Shader uniform data updates
|
|
* - Staging of data for resource copies, or, data reads/writes.
|
|
*
|
|
* Usage:
|
|
*
|
|
* MTLContext::get_scratchbuffer_manager() - to fetch active manager.
|
|
*
|
|
* MTLTemporaryBuffer scratch_buffer_allocate_range(size)
|
|
* MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(size, align)
|
|
*
|
|
* ---------------------------------------------------------------------------------
|
|
* ** MTLBufferPool **
|
|
*
|
|
* For static and longer-lasting memory allocations, such as those for UBOs,
|
|
* Vertex buffers, index buffers, etc; We want an optimal abstraction for
|
|
* fetching a MTLBuffer of the desired size and resource options.
|
|
*
|
|
* Memory allocations can be expensive so the MTLBufferPool provides
|
|
* functionality to track usage of these buffers and once a buffer
|
|
* is no longer in use, it is returned to the buffer pool for use
|
|
* by another backend resource.
|
|
*
|
|
* The MTLBufferPool provides functionality for safe tracking of resources,
|
|
* as buffers freed on the host side must have their usage by the GPU tracked,
|
|
* to ensure they are not prematurely re-used before they have finished being
|
|
* used by the GPU.
|
|
*
|
|
* NOTE: The MTLBufferPool is a global construct which can be fetched from anywhere.
|
|
*
|
|
* Usage:
|
|
* MTLContext::get_global_memory_manager(); - static routine to fetch global memory manager.
|
|
*
|
|
* gpu::MTLBuffer *allocate(size, cpu_visible)
* gpu::MTLBuffer *allocate_aligned(size, alignment, cpu_visible)
* gpu::MTLBuffer *allocate_with_data(size, cpu_visible, data_ptr)
* gpu::MTLBuffer *allocate_aligned_with_data(size, alignment, cpu_visible, data_ptr)
|
|
*/
|
|
|
|
/* Debug memory statistics: compiled out via this macro, rather than guarded by a
 * run-time check, for performance reasons. Set to 1 to enable. */
|
|
#define MTL_DEBUG_MEMORY_STATISTICS 0
|
|
|
|
/* Allows a scratch buffer to temporarily grow beyond its maximum, which allows submission
|
|
* of one-time-use data packets which are too large. */
|
|
#define MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION 1
|
|
|
|
namespace blender::gpu {
|
|
|
|
/* Forward Declarations. */
|
|
class MTLContext;
|
|
class MTLCommandBufferManager;
|
|
class MTLUniformBuf;
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/** \name Memory Management.
|
|
* \{ */
|
|
|
|
/* MTLBuffer allocation wrapper. */
|
|
class MTLBuffer {
|
|
|
|
private:
|
|
/* Metal resource. */
|
|
id<MTLBuffer> metal_buffer_;
|
|
|
|
/* Host-visible mapped-memory pointer. Behavior depends on buffer type:
|
|
* - Shared buffers: pointer represents base address of #MTLBuffer whose data
|
|
* access has shared access by both the CPU and GPU on
|
|
* Unified Memory Architectures (UMA).
|
|
* - Managed buffer: Host-side mapped buffer region for CPU (Host) access. Managed buffers
|
|
* must be manually flushed to transfer data to GPU-resident buffer.
|
|
* - Private buffer: Host access is invalid, `data` will be nullptr. */
|
|
void *data_;
|
|
|
|
/* Whether buffer is allocated from an external source. */
|
|
bool is_external_ = false;
|
|
|
|
/* Allocation info. */
|
|
MTLResourceOptions options_;
|
|
id<MTLDevice> device_;
|
|
uint64_t alignment_;
|
|
uint64_t size_;
|
|
|
|
/* Allocated size may be larger than actual size. */
|
|
uint64_t usage_size_;
|
|
|
|
/* Lifetime info - whether the current buffer is actively in use. A buffer
|
|
* should be in use after it has been allocated. De-allocating the buffer, and
|
|
* returning it to the free buffer pool will set in_use to false. Using a buffer
|
|
* while it is not in-use should not be allowed and result in an error. */
|
|
std::atomic<bool> in_use_;
|
|
|
|
public:
|
|
MTLBuffer(id<MTLDevice> device, uint64_t size, MTLResourceOptions options, uint alignment = 1);
|
|
MTLBuffer(id<MTLBuffer> external_buffer);
|
|
~MTLBuffer();
|
|
|
|
/* Fetch information about backing MTLBuffer. */
|
|
id<MTLBuffer> get_metal_buffer() const;
|
|
void *get_host_ptr() const;
|
|
uint64_t get_size_used() const;
|
|
uint64_t get_size() const;
|
|
|
|
/* Flush data to GPU. */
|
|
void flush();
|
|
void flush_range(uint64_t offset, uint64_t length);
|
|
bool requires_flush();
|
|
|
|
/* Buffer usage tracking. */
|
|
void flag_in_use(bool used);
|
|
bool get_in_use();
|
|
void set_usage_size(uint64_t size_used);
|
|
|
|
/* Debug. */
|
|
void set_label(NSString *str);
|
|
|
|
/* Read properties. */
|
|
MTLResourceOptions get_resource_options();
|
|
uint64_t get_alignment();
|
|
|
|
/* Resource-local free: For buffers allocated via memory manager,
|
|
* this will call the context `free_buffer` method to return the buffer to the context memory
|
|
* pool.
|
|
*
|
|
* Otherwise, free will release the associated metal resource.
|
|
* As a note, calling the destructor will also destroy the buffer and associated metal
|
|
* resource. */
|
|
void free();
|
|
|
|
/* Safety check to ensure buffers are not used after free. */
|
|
void debug_ensure_used();
|
|
};
|
|
|
|
/* View into part of an MTLBuffer. */
|
|
struct MTLBufferRange {
|
|
id<MTLBuffer> metal_buffer;
|
|
void *data;
|
|
uint64_t buffer_offset;
|
|
uint64_t size;
|
|
MTLResourceOptions options;
|
|
|
|
void flush();
|
|
bool requires_flush();
|
|
};
|
|
|
|
/* Circular scratch buffer allocations should be seen as temporary and only used within the
|
|
* lifetime of the frame. */
|
|
using MTLTemporaryBuffer = MTLBufferRange;
|
|
|
|
/* Round-Robin Circular-buffer. */
|
|
class MTLCircularBuffer {
|
|
friend class MTLScratchBufferManager;
|
|
|
|
private:
|
|
MTLContext &own_context_;
|
|
|
|
/* Wrapped MTLBuffer allocation handled. */
|
|
gpu::MTLBuffer *cbuffer_;
|
|
|
|
/* Current offset where next allocation will begin. */
|
|
uint64_t current_offset_;
|
|
|
|
/* Whether the Circular Buffer can grow during re-allocation if
|
|
* the size is exceeded. */
|
|
bool can_resize_;
|
|
|
|
/* Usage information. */
|
|
uint64_t used_frame_index_;
|
|
uint64_t last_flush_base_offset_;
|
|
|
|
public:
|
|
MTLCircularBuffer(MTLContext &ctx, uint64_t initial_size, bool allow_grow);
|
|
~MTLCircularBuffer();
|
|
MTLTemporaryBuffer allocate_range(uint64_t alloc_size);
|
|
MTLTemporaryBuffer allocate_range_aligned(uint64_t alloc_size, uint alignment);
|
|
void flush();
|
|
|
|
/* Reset pointer back to start of circular buffer. */
|
|
void reset();
|
|
};
|
|
|
|
/* Wrapper struct used by Memory Manager to sort and compare gpu::MTLBuffer resources inside the
|
|
* memory pools. */
|
|
struct MTLBufferHandle {
|
|
gpu::MTLBuffer *buffer;
|
|
uint64_t buffer_size;
|
|
|
|
inline MTLBufferHandle(gpu::MTLBuffer *buf)
|
|
{
|
|
this->buffer = buf;
|
|
this->buffer_size = this->buffer->get_size();
|
|
}
|
|
|
|
inline MTLBufferHandle(uint64_t compare_size)
|
|
{
|
|
this->buffer = nullptr;
|
|
this->buffer_size = compare_size;
|
|
}
|
|
};
|
|
|
|
struct CompareMTLBuffer {
|
|
bool operator()(const MTLBufferHandle &lhs, const MTLBufferHandle &rhs) const
|
|
{
|
|
return lhs.buffer_size < rhs.buffer_size;
|
|
}
|
|
};
|
|
|
|
/**
|
|
* An #MTLSafeFreeList is a temporary list of #gpu::MTLBuffers which have
|
|
* been freed by the high level backend, but are pending GPU work execution before
|
|
* the #gpu::MTLBuffers can be returned to the Memory manager pools.
|
|
* This list is implemented as a chunked linked-list.
|
|
*
|
|
* Only a single #MTLSafeFreeList is active at one time and is associated with current command
|
|
* buffer submissions. If an #MTLBuffer is freed during the lifetime of a command buffer, it could
|
|
* still possibly be in-use and as such, the #MTLSafeFreeList will increment its reference count
|
|
* for each command buffer submitted while the current pool is active.
|
|
*
|
|
* - Reference count is incremented upon #MTLCommandBuffer commit.
|
|
* - Reference count is decremented in the #MTLCommandBuffer completion callback handler.
|
|
*
|
|
* A new #MTLSafeFreeList will begin each render step (frame). This pooling of buffers, rather than
|
|
* individual buffer resource tracking reduces performance overhead.
|
|
*
|
|
* - The reference count starts at 1 to ensure that the reference count cannot prematurely reach
|
|
* zero until any command buffers have been submitted. This additional decrement happens
|
|
* when the next #MTLSafeFreeList is created, to allow the existing pool to be released once
|
|
* the reference count hits zero after submitted command buffers complete.
|
|
*
|
|
* NOTE: the Metal API independently tracks resources used by command buffers for the purpose of
|
|
* keeping resources alive while in-use by the driver and CPU, however, this differs from the
|
|
* #MTLSafeFreeList mechanism in the Metal backend, which exists for the purpose of allowing
|
|
* previously allocated #MTLBuffer resources to be re-used. This allows us to save on the expensive
|
|
* cost of memory allocation.
|
|
*/
|
|
class MTLSafeFreeList {
|
|
friend class MTLBufferPool;
|
|
|
|
private:
|
|
std::atomic<int> reference_count_;
|
|
std::atomic<bool> in_free_queue_;
|
|
std::atomic<bool> referenced_by_workload_;
|
|
std::recursive_mutex lock_;
|
|
/* Linked list of next MTLSafeFreeList chunk if current chunk is full. */
|
|
std::atomic<MTLSafeFreeList *> next_;
|
|
|
|
/* Lockless list. MAX_NUM_BUFFERS_ within a chunk based on considerations
|
|
* for performance and memory. Higher chunk counts are preferable for efficiently
|
|
* performing block operations such as copying several objects simultaneously.
|
|
*
|
|
* MIN_BUFFER_FLUSH_COUNT refers to the minimum count of buffers in the MTLSafeFreeList
|
|
* before buffers are returned to global memory pool. This is set at a point to reduce
|
|
* overhead of small pool flushes, while ensuring floating memory overhead is not excessive. */
|
|
static const int MAX_NUM_BUFFERS_ = 8192;
|
|
static const int MIN_BUFFER_FLUSH_COUNT = 120;
|
|
std::atomic<int> current_list_index_;
|
|
gpu::MTLBuffer *safe_free_pool_[MAX_NUM_BUFFERS_];
|
|
|
|
public:
|
|
MTLSafeFreeList();
|
|
|
|
/* Can be used from multiple threads. Performs insertion into Safe Free List with the least
|
|
* amount of threading synchronization. */
|
|
void insert_buffer(gpu::MTLBuffer *buffer);
|
|
|
|
/* Whether we need to start a new safe free list, or can carry on using the existing one. */
|
|
bool should_flush();
|
|
|
|
/* Increments command buffer reference count. */
|
|
void increment_reference();
|
|
|
|
/* Decrement and return of buffers to pool occur on MTLCommandBuffer completion callback. */
|
|
void decrement_reference();
|
|
|
|
void flag_in_queue()
|
|
{
|
|
in_free_queue_ = true;
|
|
if (current_list_index_ >= MTLSafeFreeList::MAX_NUM_BUFFERS_) {
|
|
MTLSafeFreeList *next_pool = next_.load();
|
|
if (next_pool) {
|
|
next_pool->flag_in_queue();
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
/* MTLBuffer pools. */
|
|
/* Allocating Metal buffers is expensive, so we cache all allocated buffers,
|
|
* and when requesting a new buffer, find one which fits the required dimensions
|
|
* from an existing pool of buffers.
|
|
*
|
|
* When freeing MTLBuffers, we insert them into the current MTLSafeFreeList, which defers
|
|
* release of the buffer until the associated command buffers have finished executing.
|
|
* This prevents a buffer from being re-used while it is still in-use by the GPU.
|
|
*
|
|
* * Once command buffers complete, MTLSafeFreeList's associated with the current
|
|
* command buffer submission are added to the `completed_safelist_queue_`.
|
|
*
|
|
* * At a set point in time, all MTLSafeFreeList's in `completed_safelist_queue_` have their
|
|
* MTLBuffers re-inserted into the Memory Manager's pools. */
|
|
class MTLBufferPool {
|
|
|
|
private:
|
|
#if MTL_DEBUG_MEMORY_STATISTICS == 1
|
|
/* Memory statistics. */
|
|
std::atomic<int64_t> total_allocation_bytes_;
|
|
|
|
/* Debug statistics. */
|
|
std::atomic<int> per_frame_allocation_count_;
|
|
std::atomic<int64_t> allocations_in_pool_;
|
|
std::atomic<int64_t> buffers_in_pool_;
|
|
#endif
|
|
|
|
/* Metal resources. */
|
|
bool ensure_initialised_ = false;
|
|
id<MTLDevice> device_ = nil;
|
|
|
|
/* The buffer selection aims to pick a buffer which meets the minimum size requirements.
|
|
* To do this, we keep an ordered set of all available buffers. If the buffer is larger than the
|
|
* desired allocation size, we check it against `mtl_buffer_size_threshold_factor_`,
|
|
* which defines what % larger than the original allocation the buffer can be.
|
|
* - A higher value results in greater re-use of previously allocated buffers of similar sizes.
|
|
* - A lower value may result in more dynamic allocations, but minimized memory usage for a given
|
|
* scenario.
|
|
* The current value of 1.26 is calibrated for optimal performance and memory utilization. */
|
|
static constexpr float mtl_buffer_size_threshold_factor_ = 1.26;
|
|
|
|
/* Buffer pools using MTLResourceOptions as key for allocation type.
|
|
* Aliased as 'uint64_t' for map type compatibility.
|
|
* - A size-ordered list (MultiSet) of allocated buffers is kept per MTLResourceOptions
|
|
* permutation. This allows efficient lookup for buffers of a given requested size.
|
|
* - MTLBufferHandle wraps a gpu::MTLBuffer pointer to achieve easy size-based sorting
|
|
* via CompareMTLBuffer.
|
|
*
|
|
* NOTE: buffer_pool_lock_ guards against concurrent access to the memory allocator. This
|
|
* can occur during light baking or rendering operations. */
|
|
using MTLBufferPoolOrderedList = std::multiset<MTLBufferHandle, CompareMTLBuffer>;
|
|
using MTLBufferResourceOptions = uint64_t;
|
|
|
|
std::mutex buffer_pool_lock_;
|
|
blender::Map<MTLBufferResourceOptions, MTLBufferPoolOrderedList *> buffer_pools_;
|
|
blender::Vector<gpu::MTLBuffer *> allocations_;
|
|
|
|
/* Maintain a queue of all MTLSafeFreeList's that have been released
|
|
* by the GPU and are ready to have their buffers re-inserted into the
|
|
* MemoryManager pools.
|
|
* Access to this queue is made thread-safe through safelist_lock_. */
|
|
std::mutex safelist_lock_;
|
|
blender::Vector<MTLSafeFreeList *> completed_safelist_queue_;
|
|
|
|
/* Current free list, associated with active MTLCommandBuffer submission. */
|
|
/* MTLBuffer::free() can be called from separate threads, due to usage within animation
|
|
* system/worker threads. */
|
|
std::atomic<MTLSafeFreeList *> current_free_list_;
|
|
|
|
public:
|
|
void init(id<MTLDevice> device);
|
|
~MTLBufferPool();
|
|
|
|
gpu::MTLBuffer *allocate(uint64_t size, bool cpu_visible);
|
|
gpu::MTLBuffer *allocate_aligned(uint64_t size, uint alignment, bool cpu_visible);
|
|
gpu::MTLBuffer *allocate_with_data(uint64_t size, bool cpu_visible, const void *data = nullptr);
|
|
gpu::MTLBuffer *allocate_aligned_with_data(uint64_t size,
|
|
uint alignment,
|
|
bool cpu_visible,
|
|
const void *data = nullptr);
|
|
bool free_buffer(gpu::MTLBuffer *buffer);
|
|
|
|
/* Flush MTLSafeFreeList buffers, for completed lists in `completed_safelist_queue_`,
|
|
* back to memory pools. */
|
|
void update_memory_pools();
|
|
|
|
/* Access and control over active MTLSafeFreeList. */
|
|
MTLSafeFreeList *get_current_safe_list();
|
|
void begin_new_safe_list();
|
|
|
|
/* Add a completed MTLSafeFreeList to completed_safelist_queue_. */
|
|
void push_completed_safe_list(MTLSafeFreeList *list);
|
|
|
|
private:
|
|
void ensure_buffer_pool(MTLResourceOptions options);
|
|
void insert_buffer_into_pool(MTLResourceOptions options, gpu::MTLBuffer *buffer);
|
|
void free();
|
|
};
|
|
|
|
/* Scratch buffers are circular-buffers used for temporary data within the current frame.
|
|
* In order to preserve integrity of contents when having multiple-frames-in-flight,
|
|
* we cycle through a collection of scratch buffers which are reset upon next use.
|
|
*
|
|
* Below are a series of properties, declared to manage scratch buffers. If a scratch buffer
|
|
* overflows, then the original buffer will be flushed and submitted, with retained references
|
|
* by usage within the command buffer, and a new buffer will be created.
|
|
* - The new buffer will grow in size to account for increased demand in temporary memory.
|
|
*/
|
|
class MTLScratchBufferManager {
|
|
|
|
private:
|
|
/* Maximum number of scratch buffers to allocate. This should be the maximum number of
|
|
* simultaneous frames in flight. */
|
|
static constexpr uint mtl_max_scratch_buffers_ = MTL_NUM_SAFE_FRAMES;
|
|
|
|
public:
|
|
/* Maximum size of single scratch buffer allocation. When re-sizing, this is the maximum size the
|
|
* newly allocated buffers will grow to. Larger allocations are possible if
|
|
* `MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION` is enabled, but these will instead allocate new
|
|
* buffers from the memory pools on the fly. */
|
|
static constexpr uint mtl_scratch_buffer_max_size_ = 128 * 1024 * 1024;
|
|
|
|
/* Initial size of circular scratch buffers prior to growth. */
|
|
static constexpr uint mtl_scratch_buffer_initial_size_ = 16 * 1024 * 1024;
|
|
|
|
private:
|
|
/* Parent MTLContext. */
|
|
MTLContext &context_;
|
|
bool initialised_ = false;
|
|
|
|
/* Scratch buffer currently in-use. */
|
|
uint current_scratch_buffer_ = 0;
|
|
|
|
/* Scratch buffer pool. */
|
|
MTLCircularBuffer *scratch_buffers_[mtl_max_scratch_buffers_];
|
|
|
|
public:
|
|
MTLScratchBufferManager(MTLContext &context) : context_(context){};
|
|
~MTLScratchBufferManager();
|
|
|
|
/* Explicit initialization and freeing of resources.
|
|
* Initialization must occur after device creation. */
|
|
void init();
|
|
void free();
|
|
|
|
/* Allocation functions for creating temporary allocations from active circular buffer. */
|
|
MTLTemporaryBuffer scratch_buffer_allocate_range(uint64_t alloc_size);
|
|
MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(uint64_t alloc_size, uint alignment);
|
|
|
|
/* Ensure a new scratch buffer is started if we move onto a new frame.
|
|
* Called when a new command buffer begins. */
|
|
void ensure_increment_scratch_buffer();
|
|
|
|
/* Flush memory for active scratch buffer to GPU.
|
|
* This call will perform a partial flush of the buffer starting from
|
|
* the last offset the data was flushed from, to the current offset. */
|
|
void flush_active_scratch_buffer();
|
|
};
|
|
|
|
/** \} */
|
|
|
|
} // namespace blender::gpu
|