/* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
#include <atomic>
#include <functional>
#include <map>
#include <mutex>
#include <set>
#include <unordered_map>
#include "mtl_common.hh"
#include <Cocoa/Cocoa.h>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>
@class CAMetalLayer;
@class MTLCommandQueue;
@class MTLRenderPipelineState;
/* Metal Memory Manager Overview. */
/*
* The Metal Backend Memory manager is designed to provide an interface
* for all other MTL_* modules where memory allocation is required.
*
* Different allocation strategies and data-structures are used depending
* on how the data is used by the backend. These aim to optimally handle
* system memory and abstract away any complexity from the MTL_* modules
* themselves.
*
* There are two primary allocation modes which can be used:
*
* ** MTLScratchBufferManager **
*
* Each MTLContext owns a ScratchBufferManager which is implemented
* as a pool of circular buffers, designed to handle temporary
* memory allocations which occur on a per-frame basis. The scratch
* buffers allow flushing of host memory to the GPU to be batched.
*
* Each frame, the next scratch buffer is reset, then later flushed upon
* command buffer submission.
*
* NOTE: This is allocated per-context, as allocations are tied to the
* workload submissions of that specific context.
*
* Examples of scratch buffer usage are:
* - Immediate-mode temporary vertex buffers.
* - Shader uniform data updates.
* - Staging of data for resource copies, or data reads/writes.
*
* Usage:
*
* MTLContext::get_scratchbuffer_manager() - to fetch active manager.
*
* MTLTemporaryBuffer scratch_buffer_allocate_range(size)
* MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(size, align)
*
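* A minimal illustrative sketch (names follow the routines listed above, though
* exact signatures may differ; `ctx`, `uniform_data` and `size` are hypothetical):
*
*   MTLScratchBufferManager &mgr = ctx->get_scratchbuffer_manager();
*   MTLTemporaryBuffer range = mgr.scratch_buffer_allocate_range_aligned(size, 256);
*   memcpy(range.data, uniform_data, size);
*   // Valid for the current frame only; host writes are flushed to the GPU
*   // upon command buffer submission.
*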
* ---------------------------------------------------------------------------------
* ** MTLBufferPool **
*
* For static and longer-lasting memory allocations, such as those for UBOs,
* vertex buffers, index buffers, etc., we want an optimal abstraction for
* fetching an MTLBuffer of the desired size and resource options.
*
* Memory allocations can be expensive, so the MTLBufferPool tracks the usage
* of these buffers; once a buffer is no longer in use, it is returned to the
* buffer pool for re-use by another backend resource.
*
* The MTLBufferPool provides functionality for safe tracking of resources,
* as buffers freed on the host side must have their usage by the GPU tracked,
* to ensure they are not prematurely re-used before they have finished being
* used by the GPU.
*
* NOTE: The MTLBufferPool is a global construct which can be fetched from anywhere.
*
* Usage:
* MTLContext::get_global_memory_manager(); - static routine to fetch global memory manager.
*
* gpu::MTLBuffer *allocate(size, is_cpu_visible)
* gpu::MTLBuffer *allocate_aligned(size, alignment, is_cpu_visible)
* gpu::MTLBuffer *allocate_with_data(size, is_cpu_visible, data_ptr)
* gpu::MTLBuffer *allocate_aligned_with_data(size, alignment, is_cpu_visible, data_ptr)
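*
* A minimal illustrative sketch (assumes `verts` and `size` are hypothetical host
* data, and that the static accessor returns a reference; exact signatures may
* differ):
*
*   MTLBufferPool &pool = MTLContext::get_global_memory_manager();
*   gpu::MTLBuffer *buf = pool.allocate_with_data(size, true, verts);
*   // ... bind buf->get_metal_buffer() within a command encoder ...
*   buf->free();  // Deferred: returned to the pool once GPU work completes.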
*/
/* Debug memory statistics: disabled via compile-time macro rather than a runtime
* guard, for performance reasons. */
#define MTL_DEBUG_MEMORY_STATISTICS 0
/* Allows a scratch buffer to temporarily grow beyond its maximum, which allows submission
* of one-time-use data packets which are too large. */
#define MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION 1
namespace blender::gpu {
/* Forward Declarations. */
class MTLContext;
class MTLCommandBufferManager;
class MTLUniformBuf;
/* -------------------------------------------------------------------- */
/** \name Memory Management.
* \{ */
/* MTLBuffer allocation wrapper. */
class MTLBuffer {
private:
/* Metal resource. */
id<MTLBuffer> metal_buffer_;
/* Host-visible mapped-memory pointer. Behavior depends on buffer type:
* - Shared buffers: pointer is the base address of the #MTLBuffer, whose data
* is directly accessible by both the CPU and GPU on
* Unified Memory Architectures (UMA).
* - Managed buffers: host-side mapped buffer region for CPU (Host) access. Managed
* buffers must be manually flushed to transfer data to the GPU-resident buffer.
* - Private buffers: host access is invalid; `data_` will be nullptr. */
void *data_;
/* Whether buffer is allocated from an external source. */
bool is_external_ = false;
/* Allocation info. */
MTLResourceOptions options_;
id<MTLDevice> device_;
uint64_t alignment_;
uint64_t size_;
/* Size actually used; the allocated size may be larger. */
uint64_t usage_size_;
/* Lifetime info - whether the current buffer is actively in use. A buffer
* should be in use after it has been allocated. De-allocating the buffer and
* returning it to the free buffer pool will set in_use to false. Using a buffer
* while it is not in use is invalid and will result in an error. */
std::atomic<bool> in_use_;
public:
MTLBuffer(id<MTLDevice> device, uint64_t size, MTLResourceOptions options, uint alignment = 1);
MTLBuffer(id<MTLBuffer> external_buffer);
~MTLBuffer();
/* Fetch information about backing MTLBuffer. */
id<MTLBuffer> get_metal_buffer() const;
void *get_host_ptr() const;
uint64_t get_size_used() const;
uint64_t get_size() const;
/* Flush data to GPU. */
void flush();
void flush_range(uint64_t offset, uint64_t length);
bool requires_flush();
/* Buffer usage tracking. */
void flag_in_use(bool used);
bool get_in_use();
void set_usage_size(uint64_t size_used);
/* Debug. */
void set_label(NSString *str);
/* Read properties. */
MTLResourceOptions get_resource_options();
uint64_t get_alignment();
/* Resource-local free: for buffers allocated via the memory manager,
* this will call the context `free_buffer` method to return the buffer to the
* context memory pool.
*
* Otherwise, free will release the associated Metal resource.
* Note that calling the destructor will also destroy the buffer and its
* associated Metal resource. */
void free();
/* Safety check to ensure buffers are not used after free. */
void debug_ensure_used();
};
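/* Example: writing to a host-visible #MTLBuffer. Illustrative sketch only;
* `buf`, `data` and `len` are hypothetical, and the buffer is assumed to have
* been allocated CPU-visible:
*
*   memcpy(buf->get_host_ptr(), data, len);
*   buf->set_usage_size(len);
*   if (buf->requires_flush()) {
*     buf->flush_range(0, len);  // Managed storage: push host writes to the GPU.
*   }
*/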
/* View into part of an MTLBuffer. */
struct MTLBufferRange {
id<MTLBuffer> metal_buffer;
void *data;
uint64_t buffer_offset;
uint64_t size;
MTLResourceOptions options;
void flush();
bool requires_flush();
};
/* Circular scratch buffer allocations should be seen as temporary and only used within the
* lifetime of the frame. */
using MTLTemporaryBuffer = MTLBufferRange;
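/* Example: consuming a scratch range within a render command encoder.
* Illustrative sketch; `enc`, `manager`, `vertex_data` and `size` are
* hypothetical:
*
*   MTLTemporaryBuffer tmp = manager.scratch_buffer_allocate_range(size);
*   memcpy(tmp.data, vertex_data, size);
*   [enc setVertexBuffer:tmp.metal_buffer offset:tmp.buffer_offset atIndex:0];
*/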
/* Round-Robin Circular-buffer. */
class MTLCircularBuffer {
friend class MTLScratchBufferManager;
private:
MTLContext &own_context_;
/* Wrapped MTLBuffer allocation handle. */
gpu::MTLBuffer *cbuffer_;
/* Current offset where next allocation will begin. */
uint64_t current_offset_;
/* Whether the Circular Buffer can grow during re-allocation if
* the size is exceeded. */
bool can_resize_;
/* Usage information. */
uint64_t used_frame_index_;
uint64_t last_flush_base_offset_;
public:
MTLCircularBuffer(MTLContext &ctx, uint64_t initial_size, bool allow_grow);
~MTLCircularBuffer();
MTLTemporaryBuffer allocate_range(uint64_t alloc_size);
MTLTemporaryBuffer allocate_range_aligned(uint64_t alloc_size, uint alignment);
void flush();
/* Reset pointer back to start of circular buffer. */
void reset();
};
/* Wrapper struct used by Memory Manager to sort and compare gpu::MTLBuffer resources inside the
* memory pools. */
struct MTLBufferHandle {
  gpu::MTLBuffer *buffer;
  uint64_t buffer_size;

  inline MTLBufferHandle(gpu::MTLBuffer *buf)
  {
    this->buffer = buf;
    this->buffer_size = this->buffer->get_size();
  }

  inline MTLBufferHandle(uint64_t compare_size)
  {
    this->buffer = nullptr;
    this->buffer_size = compare_size;
  }
};
struct CompareMTLBuffer {
  bool operator()(const MTLBufferHandle &lhs, const MTLBufferHandle &rhs) const
  {
    return lhs.buffer_size < rhs.buffer_size;
  }
};
/**
* An #MTLSafeFreeList is a temporary list of #gpu::MTLBuffers which have
* been freed by the high level backend, but are pending GPU work execution before
* the #gpu::MTLBuffers can be returned to the Memory manager pools.
* This list is implemented as a chunked linked-list.
*
* Only a single #MTLSafeFreeList is active at one time and is associated with current command
* buffer submissions. If an #MTLBuffer is freed during the lifetime of a command buffer, it could
* still possibly be in-use and as such, the #MTLSafeFreeList will increment its reference count
* for each command buffer submitted while the current pool is active.
*
* - Reference count is incremented upon #MTLCommandBuffer commit.
* - Reference count is decremented in the #MTLCommandBuffer completion callback handler.
*
* A new #MTLSafeFreeList begins with each render step (frame). This pooling of
* buffers, rather than individual buffer resource tracking, reduces performance
* overhead.
*
* - The reference count starts at 1 so that it cannot prematurely reach zero
* before all command buffers have been submitted. The matching decrement
* happens when the next #MTLSafeFreeList is created, allowing the existing
* pool to be released once the reference count reaches zero, after submitted
* command buffers complete.
*
* NOTE: the Metal API independently tracks resources used by command buffers for the purpose of
* keeping resources alive while in-use by the driver and CPU, however, this differs from the
* #MTLSafeFreeList mechanism in the Metal backend, which exists for the purpose of allowing
* previously allocated #MTLBuffer resources to be re-used. This allows us to save on the expensive
* cost of memory allocation.
*/
class MTLSafeFreeList {
friend class MTLBufferPool;
private:
std::atomic<int> reference_count_;
std::atomic<bool> in_free_queue_;
std::atomic<bool> referenced_by_workload_;
std::recursive_mutex lock_;
/* Linked list of next MTLSafeFreeList chunk if current chunk is full. */
std::atomic<MTLSafeFreeList *> next_;
/* Lockless list. MAX_NUM_BUFFERS_ defines the capacity of a chunk, chosen as a
* balance between performance and memory. Higher chunk capacities are preferable
* for efficiently performing block operations, such as copying several objects
* simultaneously.
*
* MIN_BUFFER_FLUSH_COUNT is the minimum number of buffers in the MTLSafeFreeList
* before buffers are returned to the global memory pool. It is set at a point which
* reduces the overhead of small pool flushes, while ensuring floating memory
* overhead is not excessive. */
static const int MAX_NUM_BUFFERS_ = 8192;
static const int MIN_BUFFER_FLUSH_COUNT = 120;
std::atomic<int> current_list_index_;
gpu::MTLBuffer *safe_free_pool_[MAX_NUM_BUFFERS_];
public:
MTLSafeFreeList();
/* Can be used from multiple threads. Performs insertion into the Safe Free
* List with minimal thread synchronization. */
void insert_buffer(gpu::MTLBuffer *buffer);
/* Whether we need to start a new safe free list, or can carry on using the existing one. */
bool should_flush();
/* Increments command buffer reference count. */
void increment_reference();
/* Decrement and return of buffers to pool occur on MTLCommandBuffer completion callback. */
void decrement_reference();
void flag_in_queue()
{
  in_free_queue_ = true;
  if (current_list_index_ >= MTLSafeFreeList::MAX_NUM_BUFFERS_) {
    MTLSafeFreeList *next_pool = next_.load();
    if (next_pool) {
      next_pool->flag_in_queue();
    }
  }
}
};
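/* Lifetime sketch: how the reference count follows command buffer submissions.
* Illustrative only; the backend performs these steps internally:
*
*   MTLSafeFreeList *list = pool.get_current_safe_list();
*   list->increment_reference();  // Upon MTLCommandBuffer commit.
*   [cmd_buffer addCompletedHandler:^(id<MTLCommandBuffer> cb) {
*     list->decrement_reference();  // GPU work done; buffers may return to pool.
*   }];
*/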
/* MTLBuffer pools. */
/* Allocating Metal buffers is expensive, so we cache all allocated buffers,
* and when requesting a new buffer, find one which fits the required dimensions
* from an existing pool of buffers.
*
* When freeing MTLBuffers, we insert them into the current MTLSafeFreeList, which defers
* release of the buffer until the associated command buffers have finished executing.
* This prevents a buffer from being re-used while it is still in-use by the GPU.
*
* * Once command buffers complete, the MTLSafeFreeLists associated with that
* command buffer submission are added to the `completed_safelist_queue_`.
*
* * At a set point in time, all MTLSafeFreeLists in `completed_safelist_queue_` have
* their MTLBuffers re-inserted into the Memory Manager's pools. */
class MTLBufferPool {
private:
#if MTL_DEBUG_MEMORY_STATISTICS == 1
/* Memory statistics. */
std::atomic<int64_t> total_allocation_bytes_;
/* Debug statistics. */
std::atomic<int> per_frame_allocation_count_;
std::atomic<int64_t> allocations_in_pool_;
std::atomic<int64_t> buffers_in_pool_;
#endif
/* Metal resources. */
bool ensure_initialised_ = false;
id<MTLDevice> device_ = nil;
/* The buffer selection aims to pick a buffer which meets the minimum size requirements.
* To do this, we keep an ordered set of all available buffers. If the buffer is larger than the
* desired allocation size, we check it against `mtl_buffer_size_threshold_factor_`,
* which defines what percentage larger than the requested allocation the buffer may be.
* - A higher value results in greater re-use of previously allocated buffers of similar sizes.
* - A lower value may result in more dynamic allocations, but minimizes memory usage for a
* given scenario.
* The current value of 1.26 is calibrated for optimal performance and memory utilization;
* e.g. a 10 MiB request may be satisfied by any pooled buffer of up to 12.6 MiB. */
static constexpr float mtl_buffer_size_threshold_factor_ = 1.26;
/* Buffer pools using MTLResourceOptions as key for allocation type.
* Aliased as 'uint64_t' for map type compatibility.
* - A size-ordered list (MultiSet) of allocated buffers is kept per MTLResourceOptions
* permutation. This allows efficient lookup for buffers of a given requested size.
* - MTLBufferHandle wraps a gpu::MTLBuffer pointer to achieve easy size-based sorting
* via CompareMTLBuffer.
*
* NOTE: buffer_pool_lock_ guards against concurrent access to the memory allocator. This
* can occur during light baking or rendering operations. */
using MTLBufferPoolOrderedList = std::multiset<MTLBufferHandle, CompareMTLBuffer>;
using MTLBufferResourceOptions = uint64_t;
std::mutex buffer_pool_lock_;
blender::Map<MTLBufferResourceOptions, MTLBufferPoolOrderedList *> buffer_pools_;
blender::Vector<gpu::MTLBuffer *> allocations_;
/* Maintain a queue of all MTLSafeFreeLists whose GPU work has completed and
* which are ready to have their buffers re-inserted into the
* MemoryManager pools.
* Access to this queue is made thread-safe through safelist_lock_. */
std::mutex safelist_lock_;
blender::Vector<MTLSafeFreeList *> completed_safelist_queue_;
/* Current free list, associated with the active MTLCommandBuffer submission.
* NOTE: MTLBuffer::free() can be called from separate threads, due to usage within
* the animation system/worker threads. */
std::atomic<MTLSafeFreeList *> current_free_list_;
public:
void init(id<MTLDevice> device);
~MTLBufferPool();
gpu::MTLBuffer *allocate(uint64_t size, bool cpu_visible);
gpu::MTLBuffer *allocate_aligned(uint64_t size, uint alignment, bool cpu_visible);
gpu::MTLBuffer *allocate_with_data(uint64_t size, bool cpu_visible, const void *data = nullptr);
gpu::MTLBuffer *allocate_aligned_with_data(uint64_t size,
uint alignment,
bool cpu_visible,
const void *data = nullptr);
bool free_buffer(gpu::MTLBuffer *buffer);
/* Flush MTLSafeFreeList buffers, for completed lists in `completed_safelist_queue_`,
* back to memory pools. */
void update_memory_pools();
/* Access and control over active MTLSafeFreeList. */
MTLSafeFreeList *get_current_safe_list();
void begin_new_safe_list();
/* Add a completed MTLSafeFreeList to completed_safelist_queue_. */
void push_completed_safe_list(MTLSafeFreeList *list);
private:
void ensure_buffer_pool(MTLResourceOptions options);
void insert_buffer_into_pool(MTLResourceOptions options, gpu::MTLBuffer *buffer);
void free();
};
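/* Pool lifecycle sketch. Illustrative only; `device` and the sizes used are
* hypothetical:
*
*   MTLBufferPool pool;
*   pool.init(device);
*   gpu::MTLBuffer *buf = pool.allocate(4096, true);  // cpu_visible = true.
*   // ... use the buffer, then release it ...
*   pool.free_buffer(buf);  // Inserted into the current MTLSafeFreeList.
*   // ... once the associated command buffers complete ...
*   pool.update_memory_pools();  // Buffers re-enter the free pools.
*/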
/* Scratch buffers are circular-buffers used for temporary data within the current frame.
* In order to preserve integrity of contents when having multiple-frames-in-flight,
* we cycle through a collection of scratch buffers which are reset upon next use.
*
* Below is a series of properties declared to manage scratch buffers. If a scratch
* buffer overflows, the original buffer will be flushed and submitted, with references
* retained by usage within the command buffer, and a new buffer will be created.
* - The new buffer will grow in size to account for increased demand in temporary memory.
*/
class MTLScratchBufferManager {
private:
/* Maximum number of scratch buffers to allocate. This should be the maximum number of
* simultaneous frames in flight. */
static constexpr uint mtl_max_scratch_buffers_ = MTL_NUM_SAFE_FRAMES;
public:
/* Maximum size of single scratch buffer allocation. When re-sizing, this is the maximum size the
* newly allocated buffers will grow to. Larger allocations are possible if
* `MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION` is enabled, but these will instead allocate new
* buffers from the memory pools on the fly. */
static constexpr uint mtl_scratch_buffer_max_size_ = 128 * 1024 * 1024;
/* Initial size of circular scratch buffers prior to growth. */
static constexpr uint mtl_scratch_buffer_initial_size_ = 16 * 1024 * 1024;
private:
/* Parent MTLContext. */
MTLContext &context_;
bool initialised_ = false;
/* Scratch buffer currently in-use. */
uint current_scratch_buffer_ = 0;
/* Scratch buffer pool. */
MTLCircularBuffer *scratch_buffers_[mtl_max_scratch_buffers_];
public:
MTLScratchBufferManager(MTLContext &context) : context_(context) {}
~MTLScratchBufferManager();
/* Explicit initialization and freeing of resources.
* Initialization must occur after device creation. */
void init();
void free();
/* Allocation functions for creating temporary allocations from active circular buffer. */
MTLTemporaryBuffer scratch_buffer_allocate_range(uint64_t alloc_size);
MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(uint64_t alloc_size, uint alignment);
/* Ensure a new scratch buffer is started if we move onto a new frame.
* Called when a new command buffer begins. */
void ensure_increment_scratch_buffer();
/* Flush memory for active scratch buffer to GPU.
* This call will perform a partial flush of the buffer starting from
* the last offset the data was flushed from, to the current offset. */
void flush_active_scratch_buffer();
};
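/* Per-frame usage sketch. Illustrative only; `manager` is the context's
* scratch buffer manager and `size` is hypothetical:
*
*   manager.ensure_increment_scratch_buffer();  // A new command buffer begins.
*   MTLTemporaryBuffer tmp = manager.scratch_buffer_allocate_range(size);
*   // ... write per-frame data to tmp.data ...
*   manager.flush_active_scratch_buffer();  // Prior to command buffer submission.
*/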
/** \} */
} // namespace blender::gpu