Cycles: add NanoVDB support for Metal on Apple Silicon #104837

Closed
Brecht Van Lommel wants to merge 2 commits from brecht:cycles-nanovdb-metal into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
9 changed files with 8161 additions and 53 deletions

View File

@ -44,13 +44,21 @@ set(OPENVDB_EXTRA_ARGS
# -DLLVM_DIR=${LIBDIR}/llvm/lib/cmake/llvm
)
set(OPENVDB_PATCH ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff)
if(APPLE)
set(OPENVDB_PATCH
${OPENVDB_PATCH} &&
${PATCH_CMD} -p 0 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb_metal.diff
)
endif()
ExternalProject_Add(openvdb
URL file://${PACKAGE_DIR}/${OPENVDB_FILE}
DOWNLOAD_DIR ${DOWNLOAD_DIR}
URL_HASH ${OPENVDB_HASH_TYPE}=${OPENVDB_HASH}
CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR}
PREFIX ${BUILD_DIR}/openvdb
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/openvdb/src/openvdb < ${PATCH_DIR}/openvdb.diff
PATCH_COMMAND ${OPENVDB_PATCH}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/openvdb ${DEFAULT_CMAKE_FLAGS} ${OPENVDB_EXTRA_ARGS}
INSTALL_DIR ${LIBDIR}/openvdb
)

File diff suppressed because it is too large Load Diff

View File

@ -55,9 +55,8 @@ void device_metal_info(vector<DeviceInfo> &devices)
info.denoisers = DENOISER_NONE;
info.id = id;
if (MetalInfo::get_device_vendor(device) == METAL_GPU_AMD) {
info.has_light_tree = false;
}
info.has_nanovdb = MetalInfo::get_device_vendor(device) == METAL_GPU_APPLE;
info.has_light_tree = MetalInfo::get_device_vendor(device) != METAL_GPU_AMD;
devices.push_back(info);
device_index++;

View File

@ -67,9 +67,12 @@ class MetalDevice : public Device {
std::recursive_mutex metal_mem_map_mutex;
/* Bindless Textures */
bool is_texture(const TextureInfo &tex);
device_vector<TextureInfo> texture_info;
bool need_texture_info;
id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
id<MTLArgumentEncoder> mtlBufferArgEncoder = nil;
id<MTLBuffer> buffer_bindings_1d = nil;
id<MTLBuffer> texture_bindings_2d = nil;
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;

View File

@ -91,11 +91,6 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
}
}
texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
switch (device_vendor) {
default:
break;
@ -156,6 +151,16 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_texture.dataType = MTLDataTypeTexture;
arg_desc_texture.access = MTLArgumentAccessReadOnly;
mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
MTLArgumentDescriptor *arg_desc_buffer = [[MTLArgumentDescriptor alloc] init];
arg_desc_buffer.dataType = MTLDataTypePointer;
arg_desc_buffer.access = MTLArgumentAccessReadOnly;
mtlBufferArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_buffer ]];
buffer_bindings_1d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
texture_bindings_2d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:8192 options:default_storage_mode];
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
/* command queue for non-tracing work on the GPU */
mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
@ -180,6 +185,8 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_tex.dataType = MTLDataTypePointer;
arg_desc_tex.access = MTLArgumentAccessReadOnly;
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_buf_1d */
arg_desc_tex.index = index++;
[ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
arg_desc_tex.index = index++;
@ -249,22 +256,26 @@ MetalDevice::~MetalDevice()
* existing_devices_mutex). */
thread_scoped_lock lock(existing_devices_mutex);
for (auto &tex : texture_slot_map) {
if (tex) {
[tex release];
tex = nil;
int num_resources = texture_info.size();
for (int res = 0; res < num_resources; res++) {
if (is_texture(texture_info[res])) {
[texture_slot_map[res] release];
texture_slot_map[res] = nil;
}
}
flush_delayed_free_list();
if (texture_bindings_2d) {
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
[buffer_bindings_1d release];
[texture_bindings_2d release];
[texture_bindings_3d release];
}
[mtlTextureArgEncoder release];
[mtlBufferKernelParamsEncoder release];
[mtlBufferArgEncoder release];
[mtlASArgEncoder release];
[mtlAncillaryArgEncoder release];
[mtlGeneralCommandQueue release];
@ -328,6 +339,9 @@ void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_feat
break;
case METAL_GPU_APPLE:
global_defines += "#define __KERNEL_METAL_APPLE__\n";
# ifdef WITH_NANOVDB
global_defines += "#define WITH_NANOVDB\n";
# endif
break;
}
@ -542,6 +556,11 @@ void MetalDevice::compile_and_load(int device_id, MetalPipelineType pso_type)
}
}
bool MetalDevice::is_texture(const TextureInfo &tex)
{
return (tex.depth > 0 || tex.height > 0);
}
void MetalDevice::load_texture_info()
{
if (need_texture_info) {
@ -553,21 +572,20 @@ void MetalDevice::load_texture_info()
for (int tex = 0; tex < num_textures; tex++) {
uint64_t offset = tex * sizeof(void *);
id<MTLTexture> metal_texture = texture_slot_map[tex];
if (!metal_texture) {
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
}
else {
if (is_texture(texture_info[tex]) && texture_slot_map[tex]) {
id<MTLTexture> metal_texture = texture_slot_map[tex];
MTLTextureType type = metal_texture.textureType;
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
}
else {
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
[mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
[mtlTextureArgEncoder setTexture:nil atIndex:0];
}
}
if (default_storage_mode == MTLResourceStorageModeManaged) {
[texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
@ -740,7 +758,6 @@ void MetalDevice::generic_free(device_memory &mem)
mem.shared_pointer = 0;
/* Free device memory. */
delayed_free_list.push_back(mmem.mtlBuffer);
mmem.mtlBuffer = nil;
}
@ -975,7 +992,7 @@ void MetalDevice::global_free(device_memory &mem)
void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
{
generic_alloc(mem);
MetalDevice::MetalMem *mmem = generic_alloc(mem);
generic_copy_to(mem);
/* Resize once */
@ -984,27 +1001,32 @@ void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
/* Allocate some slots in advance, to reduce amount
* of re-allocations. */
texture_info.resize(round_up(slot + 1, 128));
texture_slot_map.resize(round_up(slot + 1, 128));
}
mem.info.data = (uint64_t)mem.device_pointer;
/* Set Mapping and tag that we need to (re-)upload to device */
texture_info[slot] = mem.info;
uint64_t offset = slot * sizeof(void *);
[mtlBufferArgEncoder setArgumentBuffer:buffer_bindings_1d offset:offset];
[mtlBufferArgEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
texture_info[slot].data = *(uint64_t *)((uint64_t)buffer_bindings_1d.contents + offset);
texture_slot_map[slot] = nil;
need_texture_info = true;
}
void MetalDevice::tex_alloc(device_texture &mem)
{
/* Check that dimensions fit within maximum allowable size.
* If 1D texture is allocated, use 1D buffer.
* See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
if (mem.data_width > 16384 || mem.data_height > 16384) {
set_error(string_printf(
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
mem.data_width,
mem.data_height));
return;
if (mem.data_height > 0) {
if (mem.data_width > 16384 || mem.data_height > 16384) {
set_error(string_printf(
"Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
mem.data_width,
mem.data_height));
return;
}
}
MTLStorageMode storage_mode = MTLStorageModeManaged;
if (@available(macos 10.15, *)) {
if ([mtlDevice hasUnifiedMemory] &&
@ -1144,8 +1166,9 @@ void MetalDevice::tex_alloc(device_texture &mem)
bytesPerRow:src_pitch];
}
else {
assert(0);
/* 1D texture, using linear memory. */
tex_alloc_as_buffer(mem);
return;
}
mem.device_pointer = (device_ptr)mtlTexture;
@ -1169,17 +1192,22 @@ void MetalDevice::tex_alloc(device_texture &mem)
ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
if (texture_bindings_2d) {
delayed_free_list.push_back(buffer_bindings_1d);
delayed_free_list.push_back(texture_bindings_2d);
delayed_free_list.push_back(texture_bindings_3d);
stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
stats.mem_free(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
}
buffer_bindings_1d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
options:default_storage_mode];
stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
stats.mem_alloc(buffer_bindings_1d.allocatedSize + texture_bindings_2d.allocatedSize +
texture_bindings_3d.allocatedSize);
}
}
@ -1206,12 +1234,18 @@ void MetalDevice::tex_alloc(device_texture &mem)
void MetalDevice::tex_free(device_texture &mem)
{
if (mem.data_depth == 0 && mem.data_height == 0) {
generic_free(mem);
return;
}
if (metal_mem_map.count(&mem)) {
std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
MetalMem &mmem = *metal_mem_map.at(&mem);
assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
texture_slot_map[mem.slot] = nil;
if (texture_slot_map[mem.slot] == mmem.mtlTexture)
texture_slot_map[mem.slot] = nil;
if (mmem.mtlTexture) {
/* Free bindless texture. */

View File

@ -477,17 +477,21 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
offset:0
atIndex:1];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
offset:0
atIndex:2];
if (@available(macos 12.0, *)) {
if (metal_device_->use_metalrt) {
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
offset:0
atIndex:7];
atIndex:8];
[metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
offset:0
atIndex:8];
atIndex:9];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
@ -497,13 +501,13 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
atIndex:1];
[metal_device_->mtlAncillaryArgEncoder
setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
atIndex:3 + table];
atIndex:4 + table];
[mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
usage:MTLResourceUsageRead];
}
else {
[metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
atIndex:3 + table];
atIndex:4 + table];
}
}
}
@ -874,6 +878,7 @@ void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
/* ancillaries */
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
[mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
[mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
}
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)

View File

@ -5,13 +5,14 @@
CCL_NAMESPACE_BEGIN
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
# include "nanovdb/NanoVDB.h"
# include "nanovdb/util/SampleFromVoxels.h"
#if !defined __KERNEL_METAL__
# ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
# include "nanovdb/NanoVDB.h"
# include "nanovdb/util/SampleFromVoxels.h"
# endif
#endif
/* w0, w1, w2, and w3 are the four cubic B-spline basis functions. */
ccl_device float cubic_w0(float a)
{
@ -126,7 +127,7 @@ kernel_tex_image_interp_tricubic(ccl_global const TextureInfo &info, float x, fl
#ifdef WITH_NANOVDB
template<typename T, typename S>
ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb(
S &s, float x, float y, float z)
__global__ S &s, float x, float y, float z)
{
float px = floorf(x);
float py = floorf(y);
@ -157,13 +158,52 @@ ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tric
g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
}
template<typename T, typename S>
ccl_device typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_tricubic_nanovdb(
__local__ S &s, float x, float y, float z)
{
float px = floorf(x);
float py = floorf(y);
float pz = floorf(z);
float fx = x - px;
float fy = y - py;
float fz = z - pz;
float g0x = cubic_g0(fx);
float g1x = cubic_g1(fx);
float g0y = cubic_g0(fy);
float g1y = cubic_g1(fy);
float g0z = cubic_g0(fz);
float g1z = cubic_g1(fz);
float x0 = px + cubic_h0(fx);
float x1 = px + cubic_h1(fx);
float y0 = py + cubic_h0(fy);
float y1 = py + cubic_h1(fy);
float z0 = pz + cubic_h0(fz);
float z1 = pz + cubic_h1(fz);
using namespace nanovdb;
return g0z * (g0y * (g0x * s(Vec3f(x0, y0, z0)) + g1x * s(Vec3f(x1, y0, z0))) +
g1y * (g0x * s(Vec3f(x0, y1, z0)) + g1x * s(Vec3f(x1, y1, z0)))) +
g1z * (g0y * (g0x * s(Vec3f(x0, y0, z1)) + g1x * s(Vec3f(x1, y0, z1))) +
g1y * (g0x * s(Vec3f(x0, y1, z1)) + g1x * s(Vec3f(x1, y1, z1))));
}
# if defined(__KERNEL_METAL__)
template<typename T>
__attribute__((noinline)) typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
# else
template<typename T>
ccl_device_noinline typename nanovdb::NanoGrid<T>::ValueType kernel_tex_image_interp_nanovdb(
ccl_global const TextureInfo &info, float x, float y, float z, uint interpolation)
# endif
{
using namespace nanovdb;
NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
ccl_global NanoGrid<T> *const grid = (ccl_global NanoGrid<T> *)info.data;
typedef typename nanovdb::NanoGrid<T>::AccessorType AccessorType;
AccessorType acc = grid->getAccessor();

View File

@ -290,6 +290,10 @@ typedef metal::raytracing::intersector<triangle_data> metalrt_blas_intersector_t
/* texture bindings and sampler setup */
struct Buffer1DParamsMetal {
device float *buf;
};
struct Texture2DParamsMetal {
texture2d<float, access::sample> tex;
};
@ -306,6 +310,7 @@ struct MetalRTBlasWrapper {
struct MetalAncillaries {
device Texture2DParamsMetal *textures_2d;
device Texture3DParamsMetal *textures_3d;
device Buffer1DParamsMetal *buffers;
#ifdef __METALRT__
metalrt_as_type accel_struct;

View File

@ -3,6 +3,13 @@
// clang-format off
#ifdef WITH_NANOVDB
# define NDEBUG /* Disable "assert" in device code */
# define NANOVDB_USE_INTRINSICS
# include "nanovdb/NanoVDB.h"
# include "nanovdb/util/SampleFromVoxels.h"
#endif
/* Open the Metal kernel context class
* Necessary to access resource bindings */
class MetalKernelContext {