Metal: MTLIndexBuf class implementation.

The implementation also contains a number of optimisations and feature enablements specific to the Metal API and Apple Silicon GPUs.

Ref T96261

Reviewed By: fclem
Maniphest Tasks: T96261
Differential Revision: https://developer.blender.org/D15369
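For context, the lines below sketch how the GPUIndexBufBuilder API modified in this patch is typically driven from calling code. This is not part of the commit; the exact GPU_indexbuf_init_ex argument order and the GPU_indexbuf_calloc allocator name are assumptions, and only builder functions touched by this diff are referenced.

/* Hypothetical caller: build a point index buffer and hide one point via a restart entry. */
GPUIndexBufBuilder builder;
GPU_indexbuf_init_ex(&builder, GPU_PRIM_POINTS, /*index_len=*/4, /*vertex_len=*/4);
GPU_indexbuf_add_point_vert(&builder, 0);
GPU_indexbuf_add_point_vert(&builder, 1);
GPU_indexbuf_add_point_vert(&builder, 2);
GPU_indexbuf_add_point_vert(&builder, 3);
GPU_indexbuf_set_point_restart(&builder, 2); /* Hide point 2; sets uses_restart_indices. */

GPUIndexBuf *elem = GPU_indexbuf_calloc(); /* Allocator name assumed. */
GPU_indexbuf_build_in_place(&builder, elem);
/* On the Metal backend, IndexBuf::init() later strips the point restart entries again
 * (see strip_restart_indices() in mtl_index_buffer.mm below). */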
@@ -191,6 +191,7 @@ set(METAL_SRC
  metal/mtl_context.mm
  metal/mtl_debug.mm
  metal/mtl_framebuffer.mm
  metal/mtl_index_buffer.mm
  metal/mtl_memory.mm
  metal/mtl_query.mm
  metal/mtl_state.mm
@@ -204,6 +205,7 @@ set(METAL_SRC
  metal/mtl_context.hh
  metal/mtl_debug.hh
  metal/mtl_framebuffer.hh
  metal/mtl_index_buffer.hh
  metal/mtl_memory.hh
  metal/mtl_query.hh
  metal/mtl_state.hh
@@ -26,6 +26,9 @@ typedef struct GPUIndexBufBuilder {
  uint index_len;
  uint index_min;
  uint index_max;
  uint restart_index_value;
  bool uses_restart_indices;

  GPUPrimType prim_type;
  uint32_t *data;
} GPUIndexBufBuilder;
@@ -9,6 +9,7 @@

#pragma once

#include "BLI_assert.h"
#include "GPU_common.h"

#ifdef __cplusplus
@@ -42,6 +43,79 @@ typedef enum {
  GPU_PRIM_CLASS_ANY = GPU_PRIM_CLASS_POINT | GPU_PRIM_CLASS_LINE | GPU_PRIM_CLASS_SURFACE,
} GPUPrimClass;

inline int gpu_get_prim_count_from_type(uint vertex_len, GPUPrimType prim_type)
{
  /* does vertex_len make sense for this primitive type? */
  if (vertex_len == 0) {
    return 0;
  }

  switch (prim_type) {
    case GPU_PRIM_POINTS:
      return vertex_len;

    case GPU_PRIM_LINES:
      BLI_assert(vertex_len % 2 == 0);
      return vertex_len / 2;

    case GPU_PRIM_LINE_STRIP:
      return vertex_len - 1;

    case GPU_PRIM_LINE_LOOP:
      return vertex_len;

    case GPU_PRIM_LINES_ADJ:
      BLI_assert(vertex_len % 4 == 0);
      return vertex_len / 4;

    case GPU_PRIM_LINE_STRIP_ADJ:
      return vertex_len - 2;

    case GPU_PRIM_TRIS:
      BLI_assert(vertex_len % 3 == 0);
      return vertex_len / 3;

    case GPU_PRIM_TRI_STRIP:
      BLI_assert(vertex_len >= 3);
      return vertex_len - 2;

    case GPU_PRIM_TRI_FAN:
      BLI_assert(vertex_len >= 3);
      return vertex_len - 2;

    case GPU_PRIM_TRIS_ADJ:
      BLI_assert(vertex_len % 6 == 0);
      return vertex_len / 6;

    default:
      BLI_assert_unreachable();
      return 0;
  }
}

inline bool is_restart_compatible(GPUPrimType type)
{
  switch (type) {
    case GPU_PRIM_POINTS:
    case GPU_PRIM_LINES:
    case GPU_PRIM_TRIS:
    case GPU_PRIM_LINES_ADJ:
    case GPU_PRIM_TRIS_ADJ:
    case GPU_PRIM_NONE:
    default: {
      return false;
    }
    case GPU_PRIM_LINE_STRIP:
    case GPU_PRIM_LINE_LOOP:
    case GPU_PRIM_TRI_STRIP:
    case GPU_PRIM_TRI_FAN:
    case GPU_PRIM_LINE_STRIP_ADJ: {
      return true;
    }
  }
  return false;
}

/**
 * TODO: Improve error checking by validating that the shader is suited for this primitive type.
 * GPUPrimClass GPU_primtype_class(GPUPrimType);
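Not part of the patch, but a few values that follow directly from the helpers above, as a sanity check:

/* gpu_get_prim_count_from_type() and is_restart_compatible() example values. */
BLI_assert(gpu_get_prim_count_from_type(6, GPU_PRIM_TRIS) == 2);      /* 6 verts -> 2 triangles. */
BLI_assert(gpu_get_prim_count_from_type(6, GPU_PRIM_TRI_STRIP) == 4); /* N verts -> N - 2 tris. */
BLI_assert(gpu_get_prim_count_from_type(8, GPU_PRIM_LINES) == 4);     /* 8 verts -> 4 lines. */
BLI_assert(is_restart_compatible(GPU_PRIM_TRI_STRIP));                /* Strip/loop/fan types: true. */
BLI_assert(!is_restart_compatible(GPU_PRIM_POINTS));                  /* List types: false. */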
@@ -16,6 +16,8 @@

#include "gpu_index_buffer_private.hh"

#include "GPU_platform.h"

#include <cstring>

#define KEEP_SINGLE_COPY 1
@@ -40,6 +42,28 @@ void GPU_indexbuf_init_ex(GPUIndexBufBuilder *builder,
  builder->index_min = UINT32_MAX;
  builder->index_max = 0;
  builder->prim_type = prim_type;

#ifdef __APPLE__
  /* Only encode restart indices for restart-compatible primitive types.
   * Resolves out-of-bounds read error on macOS. Using 0-index will ensure
   * degenerative primitives when skipping primitives is required and will
   * incur no additional performance cost for rendering. */
  if (GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL)) {
    /* We will still use restart-indices for point primitives and then
     * patch these during IndexBuf::init, as we cannot benefit from degenerative
     * primitives to eliminate these. */
    builder->restart_index_value = (is_restart_compatible(prim_type) ||
                                    prim_type == GPU_PRIM_POINTS) ?
                                       RESTART_INDEX :
                                       0;
  }
  else {
    builder->restart_index_value = RESTART_INDEX;
  }
#else
  builder->restart_index_value = RESTART_INDEX;
#endif
  builder->uses_restart_indices = false;
  builder->data = (uint *)MEM_callocN(builder->max_index_len * sizeof(uint), "GPUIndexBuf data");
}
@@ -94,7 +118,8 @@ void GPU_indexbuf_add_primitive_restart(GPUIndexBufBuilder *builder)
  assert(builder->data != nullptr);
  assert(builder->index_len < builder->max_index_len);
#endif
  builder->data[builder->index_len++] = RESTART_INDEX;
  builder->data[builder->index_len++] = builder->restart_index_value;
  builder->uses_restart_indices = true;
}

void GPU_indexbuf_add_point_vert(GPUIndexBufBuilder *builder, uint v)
@@ -186,8 +211,9 @@ void GPU_indexbuf_set_point_restart(GPUIndexBufBuilder *builder, uint elem)
{
  BLI_assert(builder->prim_type == GPU_PRIM_POINTS);
  BLI_assert(elem < builder->max_index_len);
  builder->data[elem++] = RESTART_INDEX;
  builder->data[elem++] = builder->restart_index_value;
  builder->index_len = MAX2(builder->index_len, elem);
  builder->uses_restart_indices = true;
}

void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem)
@@ -195,9 +221,10 @@ void GPU_indexbuf_set_line_restart(GPUIndexBufBuilder *builder, uint elem)
  BLI_assert(builder->prim_type == GPU_PRIM_LINES);
  BLI_assert((elem + 1) * 2 <= builder->max_index_len);
  uint idx = elem * 2;
  builder->data[idx++] = RESTART_INDEX;
  builder->data[idx++] = RESTART_INDEX;
  builder->data[idx++] = builder->restart_index_value;
  builder->data[idx++] = builder->restart_index_value;
  builder->index_len = MAX2(builder->index_len, idx);
  builder->uses_restart_indices = true;
}

void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem)
@@ -205,10 +232,11 @@ void GPU_indexbuf_set_tri_restart(GPUIndexBufBuilder *builder, uint elem)
  BLI_assert(builder->prim_type == GPU_PRIM_TRIS);
  BLI_assert((elem + 1) * 3 <= builder->max_index_len);
  uint idx = elem * 3;
  builder->data[idx++] = RESTART_INDEX;
  builder->data[idx++] = RESTART_INDEX;
  builder->data[idx++] = RESTART_INDEX;
  builder->data[idx++] = builder->restart_index_value;
  builder->data[idx++] = builder->restart_index_value;
  builder->data[idx++] = builder->restart_index_value;
  builder->index_len = MAX2(builder->index_len, idx);
  builder->uses_restart_indices = true;
}

/** \} */
@@ -226,7 +254,12 @@ IndexBuf::~IndexBuf()
  }
}

void IndexBuf::init(uint indices_len, uint32_t *indices, uint min_index, uint max_index)
void IndexBuf::init(uint indices_len,
                    uint32_t *indices,
                    uint min_index,
                    uint max_index,
                    GPUPrimType prim_type,
                    bool uses_restart_indices)
{
  is_init_ = true;
  data_ = indices;
@@ -234,6 +267,21 @@ void IndexBuf::init(uint indices_len, uint32_t *indices, uint min_index, uint ma
  index_len_ = indices_len;
  is_empty_ = min_index > max_index;

  /* Patch index buffer to remove restart indices from
   * non-restart-compatible primitive types. Restart indices
   * are situationally added to selectively hide vertices.
   * Metal does not support restart-indices for non-restart-compatible
   * types, as such we should remove these indices.
   *
   * We only need to perform this for point primitives, as
   * line primitives/triangle primitives can use index 0 for all
   * vertices to create a degenerative primitive, where all
   * vertices share the same index and skip rendering via HW
   * culling. */
  if (prim_type == GPU_PRIM_POINTS && uses_restart_indices) {
    this->strip_restart_indices();
  }

#if GPU_TRACK_INDEX_RANGE
  /* Everything remains 32 bit while building to keep things simple.
   * Find min/max after, then convert to smallest index type possible. */
@@ -243,7 +291,18 @@ void IndexBuf::init(uint indices_len, uint32_t *indices, uint min_index, uint ma

  if (range <= 0xFFFF) {
    index_type_ = GPU_INDEX_U16;
    this->squeeze_indices_short(min_index, max_index);
    bool do_clamp_indices = false;
# ifdef __APPLE__
    /* NOTE: For the Metal Backend, we use degenerative primitives to hide vertices
     * which are not restart compatible. When this is done, we need to ensure
     * that compressed index ranges clamp all index values within the valid
     * range, rather than maximally clamping against the USHORT restart index
     * value of 0xFFFFu, as this will cause an out-of-bounds read during
     * vertex assembly. */
    do_clamp_indices = GPU_type_matches_ex(
        GPU_DEVICE_ANY, GPU_OS_MAC, GPU_DRIVER_ANY, GPU_BACKEND_METAL);
# endif
    this->squeeze_indices_short(min_index, max_index, prim_type, do_clamp_indices);
  }
#endif
}
@@ -302,7 +361,10 @@ uint IndexBuf::index_range(uint *r_min, uint *r_max)
  return max_value - min_value;
}

void IndexBuf::squeeze_indices_short(uint min_idx, uint max_idx)
void IndexBuf::squeeze_indices_short(uint min_idx,
                                     uint max_idx,
                                     GPUPrimType prim_type,
                                     bool clamp_indices_in_range)
{
  /* data will never be *larger* than builder->data...
   * converting in place to avoid extra allocation */
@@ -311,8 +373,22 @@ void IndexBuf::squeeze_indices_short(uint min_idx, uint max_idx)

  if (max_idx >= 0xFFFF) {
    index_base_ = min_idx;
    /* NOTE: When using restart_index=0 for degenerative primitive indices,
     * the compressed index will go below zero and wrap around when min_idx > 0.
     * In order to ensure the resulting index is still within range, we instead
     * clamp the index to the maximum within the index range.
     *
     * `clamp_max_idx` represents the maximum possible index to clamp against. If primitive is
     * restart-compatible, we can just clamp against the primitive-restart value, otherwise, we
     * must assign to a valid index within the range.
     *
     * NOTE: For OpenGL we skip this by disabling clamping, as we still need to use
     * restart index values for point primitives to disable rendering. */
    uint16_t clamp_max_idx = (is_restart_compatible(prim_type) || !clamp_indices_in_range) ?
                                 0xFFFFu :
                                 (max_idx - min_idx);
    for (uint i = 0; i < index_len_; i++) {
      ushort_idx[i] = (uint16_t)MIN2(0xFFFF, uint_idx[i] - min_idx);
      ushort_idx[i] = (uint16_t)MIN2(clamp_max_idx, uint_idx[i] - min_idx);
    }
  }
  else {
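Not from the patch, but a worked example of the wraparound this clamp avoids:

/* Illustrative numbers: a GPU_PRIM_LINES buffer on Metal stores restart_index_value = 0
 * for its degenerate entries, and its real indices span min_idx = 100 .. max_idx = 60000.
 *   0 - min_idx                  -> underflows to a huge unsigned value.
 *   MIN2(0xFFFF, 0 - min_idx)    -> 0xFFFF, which Metal reads as vertex 65535: out of bounds.
 *   MIN2(max_idx - min_idx, ...) -> 59900, a valid in-range index, so the degenerate line
 *                                   collapses onto an existing vertex and is culled.
 * Restart-compatible types (and the OpenGL path, via clamp_indices_in_range = false) keep
 * clamp_max_idx = 0xFFFF so the real restart value survives compression. */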
@@ -363,7 +439,12 @@ void GPU_indexbuf_build_in_place(GPUIndexBufBuilder *builder, GPUIndexBuf *elem)
  BLI_assert(builder->data != nullptr);
  /* Transfer data ownership to GPUIndexBuf.
   * It will be uploaded upon first use. */
  unwrap(elem)->init(builder->index_len, builder->data, builder->index_min, builder->index_max);
  unwrap(elem)->init(builder->index_len,
                     builder->data,
                     builder->index_min,
                     builder->index_max,
                     builder->prim_type,
                     builder->uses_restart_indices);
  builder->data = nullptr;
}
@@ -59,7 +59,12 @@ class IndexBuf {
  IndexBuf(){};
  virtual ~IndexBuf();

  void init(uint indices_len, uint32_t *indices, uint min_index, uint max_index);
  void init(uint indices_len,
            uint32_t *indices,
            uint min_index,
            uint max_index,
            GPUPrimType prim_type,
            bool uses_restart_indices);
  void init_subrange(IndexBuf *elem_src, uint start, uint length);
  void init_build_on_device(uint index_len);

@@ -99,8 +104,12 @@ class IndexBuf {
  virtual void update_sub(uint start, uint len, const void *data) = 0;

 private:
  inline void squeeze_indices_short(uint min_idx, uint max_idx);
  inline void squeeze_indices_short(uint min_idx,
                                    uint max_idx,
                                    GPUPrimType prim_type,
                                    bool clamp_indices_in_range);
  inline uint index_range(uint *r_min, uint *r_max);
  virtual void strip_restart_indices() = 0;
};

/* Syntactic sugar. */
@@ -16,7 +16,6 @@ namespace blender::gpu {
class Batch;
class DrawList;
class FrameBuffer;
class IndexBuf;
class QueryPool;
class Shader;
class UniformBuf;
@@ -10,6 +10,7 @@
#include "mtl_backend.hh"
#include "mtl_context.hh"
#include "mtl_framebuffer.hh"
#include "mtl_index_buffer.hh"
#include "mtl_query.hh"
#include "mtl_uniform_buffer.hh"

@@ -60,8 +61,7 @@ FrameBuffer *MTLBackend::framebuffer_alloc(const char *name)

IndexBuf *MTLBackend::indexbuf_alloc()
{
  /* TODO(Metal): Implement MTLIndexBuf. */
  return nullptr;
  return new MTLIndexBuf();
};

QueryPool *MTLBackend::querypool_alloc()
@@ -3,7 +3,6 @@
/** \file
 * \ingroup gpu
 */

#pragma once

#include "MEM_guardedalloc.h"
source/blender/gpu/metal/mtl_index_buffer.hh (new file, 79 lines)
@@ -0,0 +1,79 @@
/** \file
 * \ingroup gpu
 */

#pragma once

#include "MEM_guardedalloc.h"
#include "gpu_index_buffer_private.hh"
#include "mtl_context.hh"
#include <Cocoa/Cocoa.h>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>

namespace blender::gpu {

class MTLIndexBuf : public IndexBuf {
  friend class MTLBatch;
  friend class MTLDrawList;

 private:
  /* Metal buffer resource. */
  gpu::MTLBuffer *ibo_ = nullptr;
  uint64_t alloc_size_ = 0;

#ifndef NDEBUG
  /* Flags whether point index buffer has been compacted
   * to remove false restart indices. */
  bool point_restarts_stripped_ = false;
#endif

  /* Optimised index buffers.
   * NOTE(Metal): This optimization encodes a new index buffer following
   * TriangleList topology. Parsing of Index buffers is more optimal
   * when not using restart-compatible primitive topology types. */
  GPUPrimType optimized_primitive_type_;
  gpu::MTLBuffer *optimized_ibo_ = nullptr;
  uint32_t emulated_v_count = 0;
  void free_optimized_buffer();

  /* Flags whether an index buffer can be optimized.
   * For index buffers which are partially modified
   * on the host, or by the GPU, optimization cannot be performed. */
  bool can_optimize_ = true;

 public:
  ~MTLIndexBuf();

  void bind_as_ssbo(uint32_t binding) override;
  const uint32_t *read() const override;

  void upload_data() override;
  void update_sub(uint32_t start, uint32_t len, const void *data) override;

  /* get_index_buffer can conditionally return an optimized index buffer of a
   * differing format, if it is concluded that optimization is preferred
   * for the given inputs.
   * Index buffer optimization is used to replace restart-compatible
   * primitive types with non-restart-compatible ones such as TriangleList and
   * LineList. This improves GPU execution for these types significantly, while
   * only incurring a small performance penalty.
   *
   * This is also used to emulate unsupported topology types
   * such as triangle fan. */
  id<MTLBuffer> get_index_buffer(GPUPrimType &in_out_primitive_type, uint &in_out_v_count);
  void flag_can_optimize(bool can_optimize);

  static MTLIndexType gpu_index_type_to_metal(GPUIndexBufType type)
  {
    return (type == GPU_INDEX_U16) ? MTLIndexTypeUInt16 : MTLIndexTypeUInt32;
  }

 private:
  void strip_restart_indices() override;

  MEM_CXX_CLASS_ALLOC_FUNCS("MTLIndexBuf")
};

} // namespace blender::gpu
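Not part of the commit: a rough sketch of how a draw path such as MTLBatch might consume get_index_buffer(). The encoder handle, the mtl_elem pointer and the hard-coded index-type value are assumptions for illustration only.

/* Hypothetical Metal draw-path usage. `encoder` is an assumed id<MTLRenderCommandEncoder>. */
GPUPrimType prim_type = GPU_PRIM_TRI_FAN; /* Topology requested by the batch. */
uint v_count = 0;                         /* Receives the emulated index count. */
id<MTLBuffer> index_buffer = mtl_elem->get_index_buffer(prim_type, v_count);
if (index_buffer != nil) {
  /* For TRI_FAN/TRI_STRIP the call rewrites prim_type to GPU_PRIM_TRIS and sets v_count to the
   * expanded triangle-list length, so the draw is encoded as a plain triangle list. */
  [encoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
                      indexCount:v_count
                       indexType:MTLIndexBuf::gpu_index_type_to_metal(GPU_INDEX_U32)
                     indexBuffer:index_buffer
               indexBufferOffset:0];
}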
source/blender/gpu/metal/mtl_index_buffer.mm (new file, 516 lines)
@@ -0,0 +1,516 @@
/** \file
 * \ingroup gpu
 */
#include "mtl_index_buffer.hh"
#include "mtl_context.hh"
#include "mtl_debug.hh"

#include "BLI_span.hh"

namespace blender::gpu {

/* -------------------------------------------------------------------- */
/** \name Core MTLIndexBuf implementation.
 * \{ */

MTLIndexBuf::~MTLIndexBuf()
{
  if (ibo_ != nullptr && !this->is_subrange_) {
    ibo_->free();
  }
  this->free_optimized_buffer();
}

void MTLIndexBuf::free_optimized_buffer()
{
  if (optimized_ibo_) {
    optimized_ibo_->free();
    optimized_ibo_ = nullptr;
  }
}

void MTLIndexBuf::bind_as_ssbo(uint32_t binding)
{
  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * can now have partial modifications from the GPU. */
  this->flag_can_optimize(false);
  this->free_optimized_buffer();

  /* Ensure we have a valid IBO. */
  BLI_assert(this->ibo_);

  /* TODO(Metal): Support index buffer SSBOs. Dependent on compute impl. */
  MTL_LOG_WARNING("MTLIndexBuf::bind_as_ssbo not yet implemented!\n");
}

const uint32_t *MTLIndexBuf::read() const
{
  if (ibo_ != nullptr) {

    /* Return host pointer. */
    void *data = ibo_->get_host_ptr();
    return static_cast<uint32_t *>(data);
  }
  BLI_assert(false && "Index buffer not ready to be read.");
  return nullptr;
}

void MTLIndexBuf::upload_data()
{
  /* Handle subrange upload. */
  if (is_subrange_) {
    MTLIndexBuf *mtlsrc = static_cast<MTLIndexBuf *>(src_);
    mtlsrc->upload_data();

#ifndef NDEBUG
    BLI_assert_msg(!mtlsrc->point_restarts_stripped_,
                   "Cannot use subrange on stripped point buffer.");
#endif

    /* If parent subrange allocation has changed,
     * update our index buffer. */
    if (alloc_size_ != mtlsrc->alloc_size_ || ibo_ != mtlsrc->ibo_) {

      /* Update index buffer and allocation from source. */
      alloc_size_ = mtlsrc->alloc_size_;
      ibo_ = mtlsrc->ibo_;

      /* Reset any allocated patched or optimized index buffers. */
      this->free_optimized_buffer();
    }
    return;
  }

  /* If new data ready, and index buffer already exists, release current. */
  if ((ibo_ != nullptr) && (this->data_ != nullptr)) {
    MTL_LOG_INFO("Re-creating index buffer with new data. IndexBuf %p\n", this);
    ibo_->free();
    ibo_ = nullptr;
  }

  /* Prepare Buffer and Upload Data. */
  if (ibo_ == nullptr && data_ != nullptr) {
    alloc_size_ = this->size_get();
    if (alloc_size_ == 0) {
      MTL_LOG_WARNING("[Metal] Warning! Trying to allocate index buffer with size=0 bytes\n");
    }
    else {
      ibo_ = MTLContext::get_global_memory_manager().allocate_with_data(alloc_size_, true, data_);
      BLI_assert(ibo_);
      ibo_->set_label(@"Index Buffer");
    }

    /* No need to keep copy of data_ in system memory. */
    MEM_SAFE_FREE(data_);
  }
}

void MTLIndexBuf::update_sub(uint32_t start, uint32_t len, const void *data)
{
  BLI_assert(!is_subrange_);

  /* If host-side data still exists, modify and upload as normal. */
  if (data_ != nullptr) {

    /* Free index buffer if one exists. */
    if (ibo_ != nullptr && !this->is_subrange_) {
      ibo_->free();
      ibo_ = nullptr;
    }

    BLI_assert(start + len < this->size_get());

    /* Apply start byte offset to data pointer. */
    void *modified_base_ptr = data_;
    uint8_t *ptr = static_cast<uint8_t *>(modified_base_ptr);
    ptr += start;
    modified_base_ptr = static_cast<void *>(ptr);

    /* Modify host-side data. */
    memcpy(modified_base_ptr, data, len);
    return;
  }

  /* Verify buffer. */
  BLI_assert(ibo_ != nullptr);

  /* Otherwise, we will inject a data update, using staged data, into the command stream.
   * Stage update contents in a temporary buffer. */
  MTLContext *ctx = static_cast<MTLContext *>(unwrap(GPU_context_active_get()));
  BLI_assert(ctx);
  MTLTemporaryBuffer range = ctx->get_scratchbuffer_manager().scratch_buffer_allocate_range(len);
  memcpy(range.data, data, len);

  /* Copy updated contents into primary buffer.
   * These changes need to be uploaded via blit to ensure the data copies happen in-order. */
  id<MTLBuffer> dest_buffer = ibo_->get_metal_buffer();
  BLI_assert(dest_buffer != nil);

  id<MTLBlitCommandEncoder> enc = ctx->main_command_buffer.ensure_begin_blit_encoder();
  [enc copyFromBuffer:range.metal_buffer
         sourceOffset:(uint32_t)range.buffer_offset
             toBuffer:dest_buffer
    destinationOffset:start
                 size:len];

  /* Synchronise changes back to host to ensure CPU-side data is up-to-date for
   * non-Shared buffers. */
  if (dest_buffer.storageMode == MTLStorageModeManaged) {
    [enc synchronizeResource:dest_buffer];
  }

  /* Invalidate patched/optimized buffers. */
  this->free_optimized_buffer();

  /* Flag buffer as incompatible with optimized/patched buffers as contents
   * have partial modifications. */
  this->flag_can_optimize(false);

  BLI_assert(false);
}

void MTLIndexBuf::flag_can_optimize(bool can_optimize)
{
  can_optimize_ = can_optimize;
}

/** \} */

/** \name Index buffer optimization and topology emulation.
 * Index buffer optimization and emulation. Optimise index buffers by
 * eliminating restart-indices.
 * Emulate unsupported index types e.g. Triangle Fan and Line Loop.
 * \{ */

/* Returns total vertices in new buffer. */
template<typename T>
static uint32_t populate_optimized_tri_strip_buf(Span<T> original_data,
                                                 MutableSpan<T> output_data,
                                                 uint32_t input_index_len)
{
  /* Generate TriangleList from TriangleStrip. */
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;
  T indices[3];

  for (int c_index = 0; c_index < input_index_len; c_index++) {
    T current_index = original_data[c_index];
    if (current_index == T(-1)) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
    }
    else {
      if (current_vert_len < 3) {
        /* Prepare first triangle.
         * Cache indices before generating a triangle,
         * in case we have bad primitive-restarts. */
        indices[current_vert_len] = current_index;
      }

      /* Emit triangle once we reach 3 input verts in current strip. */
      if (current_vert_len == 3) {
        /* First triangle in strip. */
        output_data[current_output_ind++] = indices[0];
        output_data[current_output_ind++] = indices[1];
        output_data[current_output_ind++] = indices[2];
      }
      else if (current_vert_len > 3) {
        /* All other triangles in strip.
         * These triangles are populated using data from previous 2 vertices
         * and the latest index. */
        uint32_t tri_id = current_vert_len - 3;
        uint32_t base_output_ind = current_output_ind;
        if ((tri_id % 2) == 0) {
          output_data[base_output_ind + 0] = output_data[base_output_ind - 2];
          output_data[base_output_ind + 1] = current_index;
          output_data[base_output_ind + 2] = output_data[base_output_ind - 1];
        }
        else {
          output_data[base_output_ind + 0] = output_data[base_output_ind - 1];
          output_data[base_output_ind + 1] = output_data[base_output_ind - 2];
          output_data[base_output_ind + 2] = current_index;
        }
        current_output_ind += 3;
      }

      /* Increment relative vertex index. */
      current_vert_len++;
    }
  }
  return current_output_ind;
}

/* Returns total vertices in new buffer. */
template<typename T>
static uint32_t populate_emulated_tri_fan_buf(Span<T> original_data,
                                              MutableSpan<T> output_data,
                                              uint32_t input_index_len)
{
  /* Generate TriangleList from TriangleFan. */
  T base_prim_ind_val = 0;
  uint32_t current_vert_len = 0;
  uint32_t current_output_ind = 0;
  T indices[3];

  for (int c_index = 0; c_index < input_index_len; c_index++) {
    T current_index = original_data[c_index];
    if (current_index == T(-1)) {
      /* Stop current primitive. Move onto next. */
      current_vert_len = 0;
    }
    else {
      if (current_vert_len < 3) {
        /* Prepare first triangle.
         * Cache indices before generating a triangle,
         * in case we have bad primitive-restarts. */
        indices[current_vert_len] = current_index;
      }

      /* Emit triangle once we reach 3 input verts in current strip. */
      if (current_vert_len == 3) {
        /* First triangle in strip. */
        output_data[current_output_ind++] = indices[0];
        output_data[current_output_ind++] = indices[1];
        output_data[current_output_ind++] = indices[2];
        base_prim_ind_val = indices[0];
      }
      else if (current_vert_len > 3) {
        /* All other triangles in strip.
         * These triangles are populated using data from previous 2 vertices
         * and the latest index. */
        uint32_t base_output_ind = current_output_ind;

        output_data[base_output_ind + 0] = base_prim_ind_val;
        output_data[base_output_ind + 1] = output_data[base_output_ind - 1];
        output_data[base_output_ind + 2] = current_index;
        current_output_ind += 3;
      }

      /* Increment relative vertex index. */
      current_vert_len++;
    }
  }
  return current_output_ind;
}

id<MTLBuffer> MTLIndexBuf::get_index_buffer(GPUPrimType &in_out_primitive_type,
                                            uint32_t &in_out_v_count)
{
  /* Determine whether to return the original index buffer, or whether we
   * should emulate an unsupported primitive type, or optimize a restart-
   * compatible type for faster performance. */
  bool should_optimize_or_emulate = (in_out_primitive_type == GPU_PRIM_TRI_FAN) ||
                                    (in_out_primitive_type == GPU_PRIM_TRI_STRIP);
  if (!should_optimize_or_emulate || is_subrange_ || !can_optimize_) {
    /* Ensure we are not optimized. */
    BLI_assert(this->optimized_ibo_ == nullptr);

    /* Return regular index buffer. */
    BLI_assert(this->ibo_ && this->ibo_->get_metal_buffer());
    return this->ibo_->get_metal_buffer();
  }

  /* Perform optimization on type. */
  GPUPrimType input_prim_type = in_out_primitive_type;
  this->upload_data();
  if (!ibo_ && optimized_ibo_ == nullptr) {
    /* Cannot optimize buffer if no source IBO exists. */
    return nil;
  }

  /* Verify whether existing index buffer is valid. */
  if (optimized_ibo_ != nullptr && optimized_primitive_type_ != input_prim_type) {
    BLI_assert_msg(false,
                   "Cannot change the optimized primitive format after generation, as source "
                   "index buffer data is discarded.");
    return nil;
  }

  /* Generate optimized index buffer. */
  if (optimized_ibo_ == nullptr) {

    /* Generate unwrapped index buffer. */
    switch (input_prim_type) {
      case GPU_PRIM_TRI_FAN: {

        /* Calculate maximum size. */
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager().allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), this->index_len_);
          emulated_v_count = populate_emulated_tri_fan_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), this->index_len_);
          emulated_v_count = populate_emulated_tri_fan_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }

        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_TRI_STRIP: {

        /* Calculate maximum size. */
        uint32_t max_possible_verts = (this->index_len_ - 2) * 3;
        BLI_assert(max_possible_verts > 0);

        /* Allocate new buffer. */
        optimized_ibo_ = MTLContext::get_global_memory_manager().allocate(
            max_possible_verts *
                ((index_type_ == GPU_INDEX_U16) ? sizeof(uint16_t) : sizeof(uint32_t)),
            true);

        /* Populate new index buffer. */
        if (index_type_ == GPU_INDEX_U16) {
          Span<uint16_t> orig_data(static_cast<const uint16_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint16_t> output_data(
              static_cast<uint16_t *>(optimized_ibo_->get_host_ptr()), this->index_len_);
          emulated_v_count = populate_optimized_tri_strip_buf<uint16_t>(
              orig_data, output_data, this->index_len_);
        }
        else {
          Span<uint32_t> orig_data(static_cast<const uint32_t *>(ibo_->get_host_ptr()),
                                   this->index_len_);
          MutableSpan<uint32_t> output_data(
              static_cast<uint32_t *>(optimized_ibo_->get_host_ptr()), this->index_len_);
          emulated_v_count = populate_optimized_tri_strip_buf<uint32_t>(
              orig_data, output_data, this->index_len_);
        }

        BLI_assert(emulated_v_count <= max_possible_verts);

        /* Flush buffer and output. */
        optimized_ibo_->flush();
        optimized_primitive_type_ = input_prim_type;
        in_out_v_count = emulated_v_count;
        in_out_primitive_type = GPU_PRIM_TRIS;
      } break;

      case GPU_PRIM_LINE_STRIP: {
        /* TODO(Metal): Line strip topology types would benefit from optimization to remove
         * primitive restarts, however, these do not occur frequently, nor with
         * significant geometry counts. */
        MTL_LOG_INFO("TODO: Primitive topology: Optimise line strip topology types\n");
      } break;

      case GPU_PRIM_LINE_LOOP: {
        /* TODO(Metal): Line Loop primitive type requires use of optimized index buffer for
         * emulation, if used with indexed rendering. This path is currently not hit as LineLoop
         * does not currently appear to be used alongside an index buffer. */
        MTL_LOG_WARNING(
            "TODO: Primitive topology: Line Loop Index buffer optimization required for "
            "emulation.\n");
      } break;

      case GPU_PRIM_TRIS:
      case GPU_PRIM_LINES:
      case GPU_PRIM_POINTS: {
        /* Should not get here - TRIS/LINES/POINTS do not require emulation or optimization. */
        BLI_assert_unreachable();
        return nil;
      }

      default:
        /* Should not get here - Invalid primitive type. */
        BLI_assert_unreachable();
        break;
    }
  }

  /* Return optimized buffer. */
  if (optimized_ibo_ != nullptr) {

    /* Delete original buffer if one still exists, as we do not need it. */
    if (ibo_ != nullptr) {
      ibo_->free();
      ibo_ = nullptr;
    }

    /* Output params. */
    in_out_v_count = emulated_v_count;
    in_out_primitive_type = GPU_PRIM_TRIS;
    return optimized_ibo_->get_metal_buffer();
  }
  return nil;
}

void MTLIndexBuf::strip_restart_indices()
{
  /* We remove point buffer primitive restart indices by swapping restart indices
   * with the first valid index at the end of the index buffer and reducing the
   * length. Primitive restarts are invalid in Metal for non-restart-compatible
   * primitive types. We also cannot just use zero unlike for Lines and Triangles,
   * as we cannot create de-generative point primitives to hide geometry, as each
   * point is independent.
   * Instead, we must remove these hidden indices from the index buffer.
   * Note: This happens prior to index squeezing so we operate on 32-bit indices. */
  MutableSpan<uint32_t> uint_idx(static_cast<uint32_t *>(data_), index_len_);
  for (uint i = 0; i < index_len_; i++) {
    if (uint_idx[i] == 0xFFFFFFFFu) {

      /* Find swap index at end of index buffer. */
      int swap_index = -1;
      for (uint j = index_len_ - 1; j >= i; j--) {
        /* If end index is restart, just reduce length. */
        if (uint_idx[j] == 0xFFFFFFFFu) {
          index_len_--;
          continue;
        }
        /* Otherwise assign swap index. */
        swap_index = j;
        break;
      }

      /* If swap index is not valid, then there were no valid non-restart indices
       * to swap with. However, the above loop will have removed these indices by
       * reducing the length of indices. Debug assertions verify that the restart
       * index is no longer included. */
      if (swap_index == -1) {
        BLI_assert(index_len_ <= i);
      }
      else {
        /* If we have found an index we can swap with, flip the values.
         * We also reduce the length. As per above loop, swap_index should
         * now be outside the index length range. */
        uint32_t swap_index_value = uint_idx[swap_index];
        uint_idx[i] = swap_index_value;
        uint_idx[swap_index] = 0xFFFFFFFFu;
        index_len_--;
        BLI_assert(index_len_ <= swap_index);
      }
    }
  }

#ifndef NDEBUG
  /* Flag as having been stripped to ensure invalid usage is tracked. */
  point_restarts_stripped_ = true;
#endif
}

/** \} */

} // blender::gpu
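A worked example of the point-restart stripping above, not from the commit itself:

/* Worked example for strip_restart_indices():
 *   input : data_ = [5, 0xFFFFFFFF, 7, 0xFFFFFFFF, 9], index_len_ = 5
 *   i = 1 : swapped with last valid index (9)  -> [5, 9, 7, ...], index_len_ = 4
 *   i = 3 : trailing entries are all restarts  -> length reduced, index_len_ = 3
 *   output: data_ = [5, 9, 7], index_len_ = 3
 * No restart value reaches the GPU; reordering is fine because each point is independent. */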
@@ -25,7 +25,7 @@ class MTLQueryPool : public QueryPool {
  MTLVisibilityResultMode mtl_type_;
  Vector<gpu::MTLBuffer *> buffer_;

  void allocate_buffer();
  void allocate();

 public:
  MTLQueryPool();
@@ -16,7 +16,7 @@ static const size_t VISIBILITY_RESULT_SIZE_IN_BYTES = 8;

MTLQueryPool::MTLQueryPool()
{
  allocate_buffer();
  allocate();
}
MTLQueryPool::~MTLQueryPool()
{
@@ -26,7 +26,7 @@ MTLQueryPool::~MTLQueryPool()
  }
}

void MTLQueryPool::allocate_buffer()
void MTLQueryPool::allocate()
{
  /* Allocate Metal buffer for visibility results. */
  size_t buffer_size_in_bytes = VISIBILITY_COUNT_PER_BUFFER * VISIBILITY_RESULT_SIZE_IN_BYTES;
@@ -62,7 +62,7 @@ void MTLQueryPool::begin_query()
  int query_id = query_issued_;
  int requested_buffer = query_id / VISIBILITY_COUNT_PER_BUFFER;
  if (requested_buffer >= buffer_.size()) {
    allocate_buffer();
    allocate();
  }

  BLI_assert(requested_buffer < buffer_.size());
@@ -53,6 +53,10 @@ class GLIndexBuf : public IndexBuf {

 private:
  bool is_active() const;
  void strip_restart_indices() override
  {
    /* No-op. */
  }

  MEM_CXX_CLASS_ALLOC_FUNCS("GLIndexBuf")
};