Cycles: Metal host-side code

This patch adds the Metal host-side code:

- Add all core host-side Metal backend files (`device_impl`, `queue`, etc.)
- Add MetalRT BVH setup files
- Integrate with Cycles device enumeration code
- Revive `path_source_replace_includes` in util/path (required for MSL compilation); a usage sketch follows after this list
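
For background on the last point, the Metal compiler consumes MSL source as a single string at runtime and cannot resolve Cycles kernel headers from disk, so all `#include` directives must be textually inlined first. A minimal usage sketch, assuming a `path_source_replace_includes(source, path)` signature; the paths, variable names, and call site below are hypothetical, not taken from this patch:

```
/* Sketch only: inline #includes so the Metal compiler receives one
 * self-contained MSL source string. Names and paths are hypothetical. */
string source;
path_read_text(path_get("source/kernel/device/metal/kernel.metal"), source);
source = path_source_replace_includes(source, path_get("source"));

MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
NSError *error = nil;
id<MTLLibrary> mtlLibrary = [mtlDevice newLibraryWithSource:@(source.c_str())
                                                    options:options
                                                      error:&error];
```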

This patch also includes a couple of small kernel-side fixes:

- Add an implementation of `lgammaf` for Metal, based on [Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function", Archiv der Mathematik](https://users.renyi.hu/~gergonemes/); a sketch follows after this list
- Include `work_stealing.h` inside the Metal context class, because it now accesses state

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D13423
Commit 9558fa5196 (parent 565b33c0ad)
2021-12-07 15:11:35 +00:00
34 changed files with 4355 additions and 32 deletions


@@ -101,6 +101,11 @@ add_definitions(${GL_DEFINITIONS})
if(WITH_CYCLES_DEVICE_HIP)
add_definitions(-DWITH_HIP)
endif()
if(WITH_CYCLES_DEVICE_METAL)
add_definitions(-DWITH_METAL)
endif()
if(WITH_MOD_FLUID)
add_definitions(-DWITH_FLUID)
endif()


@@ -28,7 +28,7 @@ def _configure_argument_parser():
action='store_true')
parser.add_argument("--cycles-device",
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
"Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'"
"Valid options are 'CPU', 'CUDA', 'OPTIX', 'HIP' or 'METAL'."
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
default=None)
return parser


@@ -111,7 +111,8 @@ enum_device_type = (
('CPU', "CPU", "CPU", 0),
('CUDA', "CUDA", "CUDA", 1),
('OPTIX', "OptiX", "OptiX", 3),
("HIP", "HIP", "HIP", 4)
('HIP', "HIP", "HIP", 4),
('METAL', "Metal", "Metal", 5)
)
enum_texture_limit = (
@@ -1312,8 +1313,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def get_device_types(self, context):
import _cycles
has_cuda, has_optix, has_hip = _cycles.get_device_types()
has_cuda, has_optix, has_hip, has_metal = _cycles.get_device_types()
list = [('NONE', "None", "Don't use compute device", 0)]
if has_cuda:
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
@@ -1321,6 +1321,8 @@ class CyclesPreferences(bpy.types.AddonPreferences):
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
if has_hip:
list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
if has_metal:
list.append(('METAL', "Metal", "Use Metal for GPU acceleration", 5))
return list
@@ -1346,7 +1348,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
def update_device_entries(self, device_list):
for device in device_list:
if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP', 'METAL'}:
continue
# Try to find existing Device entry
entry = self.find_existing_device_entry(device)
@@ -1390,7 +1392,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
import _cycles
# Ensure `self.devices` is not re-allocated when the second call to
# get_devices_for_type is made, freeing items from the first list.
for device_type in ('CUDA', 'OPTIX', 'HIP'):
for device_type in ('CUDA', 'OPTIX', 'HIP', 'METAL'):
self.update_device_entries(_cycles.available_devices(device_type))
# Deprecated: use refresh_devices instead.


@@ -97,6 +97,11 @@ def use_cpu(context):
return (get_device_type(context) == 'NONE' or cscene.device == 'CPU')
def use_metal(context):
cscene = context.scene.cycles
return (get_device_type(context) == 'METAL' and cscene.device == 'GPU')
def use_cuda(context):
cscene = context.scene.cycles


@@ -27,6 +27,7 @@ enum ComputeDevice {
COMPUTE_DEVICE_CUDA = 1,
COMPUTE_DEVICE_OPTIX = 3,
COMPUTE_DEVICE_HIP = 4,
COMPUTE_DEVICE_METAL = 5,
COMPUTE_DEVICE_NUM
};
@@ -85,6 +86,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
else if (compute_device == COMPUTE_DEVICE_HIP) {
mask |= DEVICE_MASK_HIP;
}
else if (compute_device == COMPUTE_DEVICE_METAL) {
mask |= DEVICE_MASK_METAL;
}
vector<DeviceInfo> devices = Device::available_devices(mask);
/* Match device preferences and available devices. */


@@ -906,16 +906,18 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
{
vector<DeviceType> device_types = Device::available_types();
bool has_cuda = false, has_optix = false, has_hip = false;
bool has_cuda = false, has_optix = false, has_hip = false, has_metal = false;
foreach (DeviceType device_type, device_types) {
has_cuda |= (device_type == DEVICE_CUDA);
has_optix |= (device_type == DEVICE_OPTIX);
has_hip |= (device_type == DEVICE_HIP);
has_metal |= (device_type == DEVICE_METAL);
}
PyObject *list = PyTuple_New(3);
PyObject *list = PyTuple_New(4);
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
PyTuple_SET_ITEM(list, 3, PyBool_FromLong(has_metal));
return list;
}
@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
else if (override == "HIP") {
BlenderSession::device_override = DEVICE_MASK_HIP;
}
else if (override == "METAL") {
BlenderSession::device_override = DEVICE_MASK_METAL;
}
else {
printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
Py_RETURN_FALSE;


@@ -31,6 +31,7 @@ set(SRC
sort.cpp
split.cpp
unaligned.cpp
metal.mm
)
set(SRC_HEADERS
@@ -46,6 +47,7 @@ set(SRC_HEADERS
sort.h
split.h
unaligned.h
metal.h
)
set(LIB


@@ -19,6 +19,7 @@
#include "bvh/bvh2.h"
#include "bvh/embree.h"
#include "bvh/metal.h"
#include "bvh/multi.h"
#include "bvh/optix.h"
@@ -105,13 +106,18 @@ BVH *BVH::create(const BVHParams &params,
#else
(void)device;
break;
#endif
case BVH_LAYOUT_METAL:
#ifdef WITH_METAL
return bvh_metal_create(params, geometry, objects, device);
#else
(void)device;
break;
#endif
case BVH_LAYOUT_MULTI_OPTIX:
case BVH_LAYOUT_MULTI_OPTIX_EMBREE:
case BVH_LAYOUT_MULTI_METAL_EMBREE:
return new BVHMulti(params, geometry, objects);
case BVH_LAYOUT_METAL:
/* host-side changes for BVH_LAYOUT_METAL are imminent */
case BVH_LAYOUT_NONE:
case BVH_LAYOUT_ALL:
break;

intern/cycles/bvh/metal.h (new file, 35 lines)

@@ -0,0 +1,35 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __BVH_METAL_H__
#define __BVH_METAL_H__
#ifdef WITH_METAL
# include "bvh/bvh.h"
CCL_NAMESPACE_BEGIN
BVH *bvh_metal_create(const BVHParams &params,
const vector<Geometry *> &geometry,
const vector<Object *> &objects,
Device *device);
CCL_NAMESPACE_END
#endif /* WITH_METAL */
#endif /* __BVH_METAL_H__ */


@@ -0,0 +1,33 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_METAL
# include "device/metal/bvh.h"
CCL_NAMESPACE_BEGIN
BVH *bvh_metal_create(const BVHParams &params,
const vector<Geometry *> &geometry,
const vector<Object *> &objects,
Device *device)
{
return new BVHMetal(params, geometry, objects, device);
}
CCL_NAMESPACE_END
#endif /* WITH_METAL */


@@ -551,4 +551,18 @@ if(NOT WITH_HIP_DYNLOAD)
set(WITH_HIP_DYNLOAD ON)
endif()
###########################################################################
# Metal
###########################################################################
if(WITH_CYCLES_DEVICE_METAL)
FIND_LIBRARY(METAL_LIBRARY Metal)
if (METAL_LIBRARY)
message(STATUS "Found Metal: ${METAL_LIBRARY}")
else()
message(STATUS "Metal not found, disabling WITH_CYCLES_DEVICE_METAL")
set(WITH_CYCLES_DEVICE_METAL OFF)
endif()
endif()
unset(_cycles_lib_dir)


@@ -43,7 +43,7 @@ if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
add_definitions(-DWITH_HIP_DYNLOAD)
endif()
set(SRC
set(SRC_BASE
device.cpp
denoise.cpp
graphics_interop.cpp
@@ -104,6 +104,21 @@ set(SRC_MULTI
multi/device.h
)
set(SRC_METAL
metal/bvh.mm
metal/bvh.h
metal/device.mm
metal/device.h
metal/device_impl.mm
metal/device_impl.h
metal/kernel.mm
metal/kernel.h
metal/queue.mm
metal/queue.h
metal/util.mm
metal/util.h
)
set(SRC_OPTIX
optix/device.cpp
optix/device.h
@@ -123,6 +138,17 @@ set(SRC_HEADERS
queue.h
)
set(SRC
${SRC_BASE}
${SRC_CPU}
${SRC_CUDA}
${SRC_HIP}
${SRC_DUMMY}
${SRC_MULTI}
${SRC_OPTIX}
${SRC_HEADERS}
)
set(LIB
cycles_kernel
cycles_util
@@ -158,6 +184,15 @@ endif()
if(WITH_CYCLES_DEVICE_OPTIX)
add_definitions(-DWITH_OPTIX)
endif()
if(WITH_CYCLES_DEVICE_METAL)
list(APPEND LIB
${METAL_LIBRARY}
)
add_definitions(-DWITH_METAL)
list(APPEND SRC
${SRC_METAL}
)
endif()
if(WITH_OPENIMAGEDENOISE)
list(APPEND LIB
@@ -168,20 +203,12 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
cycles_add_library(cycles_device "${LIB}"
${SRC}
${SRC_CPU}
${SRC_CUDA}
${SRC_HIP}
${SRC_DUMMY}
${SRC_MULTI}
${SRC_OPTIX}
${SRC_HEADERS}
)
cycles_add_library(cycles_device "${LIB}" ${SRC})
source_group("cpu" FILES ${SRC_CPU})
source_group("cuda" FILES ${SRC_CUDA})
source_group("dummy" FILES ${SRC_DUMMY})
source_group("multi" FILES ${SRC_MULTI})
source_group("metal" FILES ${SRC_METAL})
source_group("optix" FILES ${SRC_OPTIX})
source_group("common" FILES ${SRC} ${SRC_HEADERS})


@@ -27,6 +27,7 @@
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
#include "device/metal/device.h"
#include "device/multi/device.h"
#include "device/optix/device.h"
@@ -49,6 +50,7 @@ vector<DeviceInfo> Device::cuda_devices;
vector<DeviceInfo> Device::optix_devices;
vector<DeviceInfo> Device::cpu_devices;
vector<DeviceInfo> Device::hip_devices;
vector<DeviceInfo> Device::metal_devices;
uint Device::devices_initialized_mask = 0;
/* Device */
@@ -105,6 +107,12 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
break;
#endif
#ifdef WITH_METAL
case DEVICE_METAL:
if (device_metal_init())
device = device_metal_create(info, stats, profiler);
break;
#endif
default:
break;
}
@@ -128,6 +136,8 @@ DeviceType Device::type_from_string(const char *name)
return DEVICE_MULTI;
else if (strcmp(name, "HIP") == 0)
return DEVICE_HIP;
else if (strcmp(name, "METAL") == 0)
return DEVICE_METAL;
return DEVICE_NONE;
}
@@ -144,6 +154,8 @@ string Device::string_from_type(DeviceType type)
return "MULTI";
else if (type == DEVICE_HIP)
return "HIP";
else if (type == DEVICE_METAL)
return "METAL";
return "";
}
@@ -161,7 +173,9 @@ vector<DeviceType> Device::available_types()
#ifdef WITH_HIP
types.push_back(DEVICE_HIP);
#endif
#ifdef WITH_METAL
types.push_back(DEVICE_METAL);
#endif
return types;
}
@@ -227,6 +241,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
}
}
#ifdef WITH_METAL
if (mask & DEVICE_MASK_METAL) {
if (!(devices_initialized_mask & DEVICE_MASK_METAL)) {
if (device_metal_init()) {
device_metal_info(metal_devices);
}
devices_initialized_mask |= DEVICE_MASK_METAL;
}
foreach (DeviceInfo &info, metal_devices) {
devices.push_back(info);
}
}
#endif
return devices;
}
@@ -266,6 +294,15 @@ string Device::device_capabilities(uint mask)
}
#endif
#ifdef WITH_METAL
if (mask & DEVICE_MASK_METAL) {
if (device_metal_init()) {
capabilities += "\nMetal device capabilities:\n";
capabilities += device_metal_capabilities();
}
}
#endif
return capabilities;
}
@@ -354,6 +391,7 @@ void Device::free_memory()
optix_devices.free_memory();
hip_devices.free_memory();
cpu_devices.free_memory();
metal_devices.free_memory();
}
unique_ptr<DeviceQueue> Device::gpu_queue_create()


@@ -52,6 +52,7 @@ enum DeviceType {
DEVICE_MULTI,
DEVICE_OPTIX,
DEVICE_HIP,
DEVICE_METAL,
DEVICE_DUMMY,
};
@@ -60,6 +61,7 @@ enum DeviceTypeMask {
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
DEVICE_MASK_HIP = (1 << DEVICE_HIP),
DEVICE_MASK_METAL = (1 << DEVICE_METAL),
DEVICE_MASK_ALL = ~0
};
@@ -281,6 +283,7 @@ class Device {
static vector<DeviceInfo> optix_devices;
static vector<DeviceInfo> cpu_devices;
static vector<DeviceInfo> hip_devices;
static vector<DeviceInfo> metal_devices;
static uint devices_initialized_mask;
};


@@ -263,6 +263,7 @@ class device_memory {
friend class CUDADevice;
friend class OptiXDevice;
friend class HIPDevice;
friend class MetalDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);


@@ -0,0 +1,66 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_METAL
# include "bvh/bvh.h"
# include "bvh/params.h"
# include "device/memory.h"
# include <Metal/Metal.h>
CCL_NAMESPACE_BEGIN
class BVHMetal : public BVH {
public:
API_AVAILABLE(macos(11.0))
id<MTLAccelerationStructure> accel_struct = nil;
bool accel_struct_building = false;
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
bool motion_blur = false;
Stats &stats;
bool build(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
BVHMetal(const BVHParams &params,
const vector<Geometry *> &geometry,
const vector<Object *> &objects,
Device *device);
virtual ~BVHMetal();
bool build_BLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
bool build_BLAS_mesh(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
Geometry *const geom,
bool refit);
bool build_BLAS_hair(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
Geometry *const geom,
bool refit);
bool build_TLAS(Progress &progress, id<MTLDevice> device, id<MTLCommandQueue> queue, bool refit);
};
CCL_NAMESPACE_END
#endif /* WITH_METAL */


@@ -0,0 +1,813 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_METAL
# include "scene/hair.h"
# include "scene/mesh.h"
# include "scene/object.h"
# include "util/progress.h"
# include "device/metal/bvh.h"
CCL_NAMESPACE_BEGIN
# define BVH_status(...) \
{ \
string str = string_printf(__VA_ARGS__); \
progress.set_substatus(str); \
}
BVHMetal::BVHMetal(const BVHParams &params_,
const vector<Geometry *> &geometry_,
const vector<Object *> &objects_,
Device *device)
: BVH(params_, geometry_, objects_), stats(device->stats)
{
}
BVHMetal::~BVHMetal()
{
if (@available(macos 12.0, *)) {
if (accel_struct) {
stats.mem_free(accel_struct.allocatedSize);
[accel_struct release];
}
}
}
bool BVHMetal::build_BLAS_mesh(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
Geometry *const geom,
bool refit)
{
if (@available(macos 12.0, *)) {
/* Build BLAS for triangle primitives */
Mesh *const mesh = static_cast<Mesh *const>(geom);
if (mesh->num_triangles() == 0) {
return false;
}
/*------------------------------------------------*/
BVH_status(
"Building mesh BLAS | %7d tris | %s", (int)mesh->num_triangles(), geom->name.c_str());
/*------------------------------------------------*/
const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
const array<float3> &verts = mesh->get_verts();
const array<int> &tris = mesh->get_triangles();
const size_t num_verts = verts.size();
const size_t num_indices = tris.size();
size_t num_motion_steps = 1;
Attribute *motion_keys = mesh->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
num_motion_steps = mesh->get_motion_steps();
}
MTLResourceOptions storage_mode;
if (device.hasUnifiedMemory) {
storage_mode = MTLResourceStorageModeShared;
}
else {
storage_mode = MTLResourceStorageModeManaged;
}
/* Upload the mesh data to the GPU */
id<MTLBuffer> posBuf = nil;
id<MTLBuffer> indexBuf = [device newBufferWithBytes:tris.data()
length:num_indices * sizeof(tris.data()[0])
options:storage_mode];
if (num_motion_steps == 1) {
posBuf = [device newBufferWithBytes:verts.data()
length:num_verts * sizeof(verts.data()[0])
options:storage_mode];
}
else {
posBuf = [device newBufferWithLength:num_verts * num_motion_steps * sizeof(verts.data()[0])
options:storage_mode];
float3 *dest_data = (float3 *)[posBuf contents];
size_t center_step = (num_motion_steps - 1) / 2;
for (size_t step = 0; step < num_motion_steps; ++step) {
const float3 *verts = mesh->get_verts().data();
/* The center step for motion vertices is not stored in the attribute. */
if (step != center_step) {
verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
}
memcpy(dest_data + num_verts * step, verts, num_verts * sizeof(float3));
}
if (storage_mode == MTLResourceStorageModeManaged) {
[posBuf didModifyRange:NSMakeRange(0, posBuf.length)];
}
}
/* Create an acceleration structure. */
MTLAccelerationStructureGeometryDescriptor *geomDesc;
if (num_motion_steps > 1) {
std::vector<MTLMotionKeyframeData *> vertex_ptrs;
vertex_ptrs.reserve(num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
k.buffer = posBuf;
k.offset = num_verts * step * sizeof(float3);
vertex_ptrs.push_back(k);
}
MTLAccelerationStructureMotionTriangleGeometryDescriptor *geomDescMotion =
[MTLAccelerationStructureMotionTriangleGeometryDescriptor descriptor];
geomDescMotion.vertexBuffers = [NSArray arrayWithObjects:vertex_ptrs.data()
count:vertex_ptrs.size()];
geomDescMotion.vertexStride = sizeof(verts.data()[0]);
geomDescMotion.indexBuffer = indexBuf;
geomDescMotion.indexBufferOffset = 0;
geomDescMotion.indexType = MTLIndexTypeUInt32;
geomDescMotion.triangleCount = num_indices / 3;
geomDescMotion.intersectionFunctionTableOffset = 0;
geomDesc = geomDescMotion;
}
else {
MTLAccelerationStructureTriangleGeometryDescriptor *geomDescNoMotion =
[MTLAccelerationStructureTriangleGeometryDescriptor descriptor];
geomDescNoMotion.vertexBuffer = posBuf;
geomDescNoMotion.vertexBufferOffset = 0;
geomDescNoMotion.vertexStride = sizeof(verts.data()[0]);
geomDescNoMotion.indexBuffer = indexBuf;
geomDescNoMotion.indexBufferOffset = 0;
geomDescNoMotion.indexType = MTLIndexTypeUInt32;
geomDescNoMotion.triangleCount = num_indices / 3;
geomDescNoMotion.intersectionFunctionTableOffset = 0;
geomDesc = geomDescNoMotion;
}
/* Force a single any-hit call, so shadow record-all behavior works correctly */
/* (Match optix behaviour: unsigned int build_flags =
* OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
geomDesc.allowDuplicateIntersectionFunctionInvocation = false;
MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
[MTLPrimitiveAccelerationStructureDescriptor descriptor];
accelDesc.geometryDescriptors = @[ geomDesc ];
if (num_motion_steps > 1) {
accelDesc.motionStartTime = 0.0f;
accelDesc.motionEndTime = 1.0f;
accelDesc.motionStartBorderMode = MTLMotionBorderModeClamp;
accelDesc.motionEndBorderMode = MTLMotionBorderModeClamp;
accelDesc.motionKeyframeCount = num_motion_steps;
}
if (!use_fast_trace_bvh) {
accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
MTLAccelerationStructureUsagePreferFastBuild);
}
MTLAccelerationStructureSizes accelSizes = [device
accelerationStructureSizesWithDescriptor:accelDesc];
id<MTLAccelerationStructure> accel_uncompressed = [device
newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
options:MTLResourceStorageModePrivate];
id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
id<MTLAccelerationStructureCommandEncoder> accelEnc =
[accelCommands accelerationStructureCommandEncoder];
if (refit) {
[accelEnc refitAccelerationStructure:accel_struct
descriptor:accelDesc
destination:accel_uncompressed
scratchBuffer:scratchBuf
scratchBufferOffset:0];
}
else {
[accelEnc buildAccelerationStructure:accel_uncompressed
descriptor:accelDesc
scratchBuffer:scratchBuf
scratchBufferOffset:0];
}
if (use_fast_trace_bvh) {
[accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
toBuffer:sizeBuf
offset:0
sizeDataType:MTLDataTypeULong];
}
[accelEnc endEncoding];
[accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
/* free temp resources */
[scratchBuf release];
[indexBuf release];
[posBuf release];
if (use_fast_trace_bvh) {
/* Compact the accel structure */
uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
id<MTLAccelerationStructureCommandEncoder> accelEnc =
[accelCommands accelerationStructureCommandEncoder];
id<MTLAccelerationStructure> accel = [device
newAccelerationStructureWithSize:compressed_size];
[accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
toAccelerationStructure:accel];
[accelEnc endEncoding];
[accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
uint64_t allocated_size = [accel allocatedSize];
stats.mem_alloc(allocated_size);
accel_struct = accel;
[accel_uncompressed release];
accel_struct_building = false;
}];
[accelCommands commit];
});
}
else {
/* set our acceleration structure to the uncompressed structure */
accel_struct = accel_uncompressed;
uint64_t allocated_size = [accel_struct allocatedSize];
stats.mem_alloc(allocated_size);
accel_struct_building = false;
}
[sizeBuf release];
}];
accel_struct_building = true;
[accelCommands commit];
return true;
}
return false;
}
bool BVHMetal::build_BLAS_hair(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
Geometry *const geom,
bool refit)
{
if (@available(macos 12.0, *)) {
/* Build BLAS for hair curves */
Hair *hair = static_cast<Hair *>(geom);
if (hair->num_curves() == 0) {
return false;
}
/*------------------------------------------------*/
BVH_status(
"Building hair BLAS | %7d curves | %s", (int)hair->num_curves(), geom->name.c_str());
/*------------------------------------------------*/
const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
const size_t num_segments = hair->num_segments();
size_t num_motion_steps = 1;
Attribute *motion_keys = hair->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
num_motion_steps = hair->get_motion_steps();
}
const size_t num_aabbs = num_segments * num_motion_steps;
MTLResourceOptions storage_mode;
if (device.hasUnifiedMemory) {
storage_mode = MTLResourceStorageModeShared;
}
else {
storage_mode = MTLResourceStorageModeManaged;
}
/* Allocate a GPU buffer for the AABB data and populate it */
id<MTLBuffer> aabbBuf = [device
newBufferWithLength:num_aabbs * sizeof(MTLAxisAlignedBoundingBox)
options:storage_mode];
MTLAxisAlignedBoundingBox *aabb_data = (MTLAxisAlignedBoundingBox *)[aabbBuf contents];
/* Get AABBs for each motion step */
size_t center_step = (num_motion_steps - 1) / 2;
for (size_t step = 0; step < num_motion_steps; ++step) {
/* The center step for motion vertices is not stored in the attribute */
const float3 *keys = hair->get_curve_keys().data();
if (step != center_step) {
size_t attr_offset = (step > center_step) ? step - 1 : step;
/* Technically this is a float4 array, but sizeof(float3) == sizeof(float4) */
keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
}
for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
const Hair::Curve curve = hair->get_curve(j);
for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
{
BoundBox bounds = BoundBox::empty;
curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
const size_t index = step * num_segments + i;
aabb_data[index].min = (MTLPackedFloat3 &)bounds.min;
aabb_data[index].max = (MTLPackedFloat3 &)bounds.max;
}
}
}
}
if (storage_mode == MTLResourceStorageModeManaged) {
[aabbBuf didModifyRange:NSMakeRange(0, aabbBuf.length)];
}
# if 0
for (size_t i=0; i<num_aabbs && i < 400; i++) {
MTLAxisAlignedBoundingBox& bb = aabb_data[i];
printf(" %d: %.1f,%.1f,%.1f -- %.1f,%.1f,%.1f\n", int(i), bb.min.x, bb.min.y, bb.min.z, bb.max.x, bb.max.y, bb.max.z);
}
# endif
MTLAccelerationStructureGeometryDescriptor *geomDesc;
if (motion_blur) {
std::vector<MTLMotionKeyframeData *> aabb_ptrs;
aabb_ptrs.reserve(num_motion_steps);
for (size_t step = 0; step < num_motion_steps; ++step) {
MTLMotionKeyframeData *k = [MTLMotionKeyframeData data];
k.buffer = aabbBuf;
k.offset = step * num_segments * sizeof(MTLAxisAlignedBoundingBox);
aabb_ptrs.push_back(k);
}
MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor *geomDescMotion =
[MTLAccelerationStructureMotionBoundingBoxGeometryDescriptor descriptor];
geomDescMotion.boundingBoxBuffers = [NSArray arrayWithObjects:aabb_ptrs.data()
count:aabb_ptrs.size()];
geomDescMotion.boundingBoxCount = num_segments;
geomDescMotion.boundingBoxStride = sizeof(aabb_data[0]);
geomDescMotion.intersectionFunctionTableOffset = 1;
/* Force a single any-hit call, so shadow record-all behavior works correctly */
/* (Match optix behaviour: unsigned int build_flags =
* OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
geomDescMotion.allowDuplicateIntersectionFunctionInvocation = false;
geomDescMotion.opaque = true;
geomDesc = geomDescMotion;
}
else {
MTLAccelerationStructureBoundingBoxGeometryDescriptor *geomDescNoMotion =
[MTLAccelerationStructureBoundingBoxGeometryDescriptor descriptor];
geomDescNoMotion.boundingBoxBuffer = aabbBuf;
geomDescNoMotion.boundingBoxBufferOffset = 0;
geomDescNoMotion.boundingBoxCount = int(num_aabbs);
geomDescNoMotion.boundingBoxStride = sizeof(aabb_data[0]);
geomDescNoMotion.intersectionFunctionTableOffset = 1;
/* Force a single any-hit call, so shadow record-all behavior works correctly */
/* (Match optix behaviour: unsigned int build_flags =
* OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;) */
geomDescNoMotion.allowDuplicateIntersectionFunctionInvocation = false;
geomDescNoMotion.opaque = true;
geomDesc = geomDescNoMotion;
}
MTLPrimitiveAccelerationStructureDescriptor *accelDesc =
[MTLPrimitiveAccelerationStructureDescriptor descriptor];
accelDesc.geometryDescriptors = @[ geomDesc ];
if (motion_blur) {
accelDesc.motionStartTime = 0.0f;
accelDesc.motionEndTime = 1.0f;
accelDesc.motionStartBorderMode = MTLMotionBorderModeVanish;
accelDesc.motionEndBorderMode = MTLMotionBorderModeVanish;
accelDesc.motionKeyframeCount = num_motion_steps;
}
if (!use_fast_trace_bvh) {
accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
MTLAccelerationStructureUsagePreferFastBuild);
}
MTLAccelerationStructureSizes accelSizes = [device
accelerationStructureSizesWithDescriptor:accelDesc];
id<MTLAccelerationStructure> accel_uncompressed = [device
newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
options:MTLResourceStorageModePrivate];
id<MTLBuffer> sizeBuf = [device newBufferWithLength:8 options:MTLResourceStorageModeShared];
id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
id<MTLAccelerationStructureCommandEncoder> accelEnc =
[accelCommands accelerationStructureCommandEncoder];
if (refit) {
[accelEnc refitAccelerationStructure:accel_struct
descriptor:accelDesc
destination:accel_uncompressed
scratchBuffer:scratchBuf
scratchBufferOffset:0];
}
else {
[accelEnc buildAccelerationStructure:accel_uncompressed
descriptor:accelDesc
scratchBuffer:scratchBuf
scratchBufferOffset:0];
}
if (use_fast_trace_bvh) {
[accelEnc writeCompactedAccelerationStructureSize:accel_uncompressed
toBuffer:sizeBuf
offset:0
sizeDataType:MTLDataTypeULong];
}
[accelEnc endEncoding];
[accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
/* free temp resources */
[scratchBuf release];
[aabbBuf release];
if (use_fast_trace_bvh) {
/* Compact the accel structure */
uint64_t compressed_size = *(uint64_t *)sizeBuf.contents;
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
id<MTLAccelerationStructureCommandEncoder> accelEnc =
[accelCommands accelerationStructureCommandEncoder];
id<MTLAccelerationStructure> accel = [device
newAccelerationStructureWithSize:compressed_size];
[accelEnc copyAndCompactAccelerationStructure:accel_uncompressed
toAccelerationStructure:accel];
[accelEnc endEncoding];
[accelCommands addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
uint64_t allocated_size = [accel allocatedSize];
stats.mem_alloc(allocated_size);
accel_struct = accel;
[accel_uncompressed release];
accel_struct_building = false;
}];
[accelCommands commit];
});
}
else {
/* set our acceleration structure to the uncompressed structure */
accel_struct = accel_uncompressed;
uint64_t allocated_size = [accel_struct allocatedSize];
stats.mem_alloc(allocated_size);
accel_struct_building = false;
}
[sizeBuf release];
}];
accel_struct_building = true;
[accelCommands commit];
return true;
}
return false;
}
bool BVHMetal::build_BLAS(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
bool refit)
{
if (@available(macos 12.0, *)) {
assert(objects.size() == 1 && geometry.size() == 1);
/* Build bottom level acceleration structures (BLAS) */
Geometry *const geom = geometry[0];
switch (geom->geometry_type) {
case Geometry::VOLUME:
case Geometry::MESH:
return build_BLAS_mesh(progress, device, queue, geom, refit);
case Geometry::HAIR:
return build_BLAS_hair(progress, device, queue, geom, refit);
default:
return false;
}
}
return false;
}
bool BVHMetal::build_TLAS(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
bool refit)
{
if (@available(macos 12.0, *)) {
/* we need to sync here and ensure that all BLAS have completed async generation by both GCD
* and Metal */
{
__block bool complete_bvh = false;
while (!complete_bvh) {
dispatch_sync(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
complete_bvh = true;
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
continue;
Geometry const *geom = ob->get_geometry();
BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
if (blas->accel_struct_building) {
complete_bvh = false;
/* We're likely waiting on a command buffer that's in flight to complete.
* Queue up a command buffer and wait for it complete before checking the BLAS again
*/
id<MTLCommandBuffer> command_buffer = [queue commandBuffer];
[command_buffer commit];
[command_buffer waitUntilCompleted];
break;
}
}
});
}
}
uint32_t num_instances = 0;
uint32_t num_motion_transforms = 0;
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
continue;
num_instances++;
if (ob->use_motion()) {
num_motion_transforms += max(1, ob->get_motion().size());
}
else {
num_motion_transforms++;
}
}
/*------------------------------------------------*/
BVH_status("Building TLAS | %7d instances", (int)num_instances);
/*------------------------------------------------*/
const bool use_fast_trace_bvh = (params.bvh_type == BVH_TYPE_STATIC);
NSMutableArray *all_blas = [NSMutableArray array];
unordered_map<BVHMetal const *, int> instance_mapping;
/* Lambda function to build/retrieve the BLAS index mapping */
auto get_blas_index = [&](BVHMetal const *blas) {
auto it = instance_mapping.find(blas);
if (it != instance_mapping.end()) {
return it->second;
}
else {
int blas_index = (int)[all_blas count];
instance_mapping[blas] = blas_index;
if (@available(macos 12.0, *)) {
[all_blas addObject:blas->accel_struct];
}
return blas_index;
}
};
MTLResourceOptions storage_mode;
if (device.hasUnifiedMemory) {
storage_mode = MTLResourceStorageModeShared;
}
else {
storage_mode = MTLResourceStorageModeManaged;
}
size_t instance_size;
if (motion_blur) {
instance_size = sizeof(MTLAccelerationStructureMotionInstanceDescriptor);
}
else {
instance_size = sizeof(MTLAccelerationStructureUserIDInstanceDescriptor);
}
/* Allocate a GPU buffer for the instance data and populate it */
id<MTLBuffer> instanceBuf = [device newBufferWithLength:num_instances * instance_size
options:storage_mode];
id<MTLBuffer> motion_transforms_buf = nil;
MTLPackedFloat4x3 *motion_transforms = nullptr;
if (motion_blur && num_motion_transforms) {
motion_transforms_buf = [device
newBufferWithLength:num_motion_transforms * sizeof(MTLPackedFloat4x3)
options:storage_mode];
motion_transforms = (MTLPackedFloat4x3 *)motion_transforms_buf.contents;
}
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
continue;
Geometry const *geom = ob->get_geometry();
BVHMetal const *blas = static_cast<BVHMetal const *>(geom->bvh);
uint32_t accel_struct_index = get_blas_index(blas);
/* Add some of the object visibility bits to the mask.
* __prim_visibility contains the combined visibility bits of all instances, so is not
* reliable if they differ between instances.
*
* METAL_WIP: OptiX visibility mask can only contain 8 bits, so have to trade-off here
* and select just a few important ones.
*/
uint32_t mask = ob->visibility_for_tracing() & 0xFF;
/* Have to have at least one bit in the mask, or else instance would always be culled. */
if (0 == mask) {
mask = 0xFF;
}
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
desc.mask = mask;
desc.motionStartTime = 0.0f;
desc.motionEndTime = 1.0f;
desc.motionTransformsStartIndex = motion_transform_index;
desc.motionStartBorderMode = MTLMotionBorderModeVanish;
desc.motionEndBorderMode = MTLMotionBorderModeVanish;
desc.intersectionFunctionTableOffset = 0;
int key_count = ob->get_motion().size();
if (key_count) {
desc.motionTransformsCount = key_count;
Transform *keys = ob->get_motion().data();
for (int i = 0; i < key_count; i++) {
float *t = (float *)&motion_transforms[motion_transform_index++];
/* Transpose transform */
auto src = (float const *)&keys[i];
for (int i = 0; i < 12; i++) {
t[i] = src[(i / 3) + 4 * (i % 3)];
}
}
}
else {
desc.motionTransformsCount = 1;
float *t = (float *)&motion_transforms[motion_transform_index++];
if (ob->get_geometry()->is_instanced()) {
/* Transpose transform */
auto src = (float const *)&ob->get_tfm();
for (int i = 0; i < 12; i++) {
t[i] = src[(i / 3) + 4 * (i % 3)];
}
}
else {
/* Clear transform to identity matrix */
t[0] = t[4] = t[8] = 1.0f;
}
}
}
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
desc.mask = mask;
desc.intersectionFunctionTableOffset = 0;
float *t = (float *)&desc.transformationMatrix;
if (ob->get_geometry()->is_instanced()) {
/* Transpose transform */
auto src = (float const *)&ob->get_tfm();
for (int i = 0; i < 12; i++) {
t[i] = src[(i / 3) + 4 * (i % 3)];
}
}
else {
/* Clear transform to identity matrix */
t[0] = t[4] = t[8] = 1.0f;
}
}
}
if (storage_mode == MTLResourceStorageModeManaged) {
[instanceBuf didModifyRange:NSMakeRange(0, instanceBuf.length)];
if (motion_transforms_buf) {
[motion_transforms_buf didModifyRange:NSMakeRange(0, motion_transforms_buf.length)];
assert(num_motion_transforms == motion_transform_index);
}
}
MTLInstanceAccelerationStructureDescriptor *accelDesc =
[MTLInstanceAccelerationStructureDescriptor descriptor];
accelDesc.instanceCount = num_instances;
accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeUserID;
accelDesc.instanceDescriptorBuffer = instanceBuf;
accelDesc.instanceDescriptorBufferOffset = 0;
accelDesc.instanceDescriptorStride = instance_size;
accelDesc.instancedAccelerationStructures = all_blas;
if (motion_blur) {
accelDesc.instanceDescriptorType = MTLAccelerationStructureInstanceDescriptorTypeMotion;
accelDesc.motionTransformBuffer = motion_transforms_buf;
accelDesc.motionTransformCount = num_motion_transforms;
}
if (!use_fast_trace_bvh) {
accelDesc.usage |= (MTLAccelerationStructureUsageRefit |
MTLAccelerationStructureUsagePreferFastBuild);
}
MTLAccelerationStructureSizes accelSizes = [device
accelerationStructureSizesWithDescriptor:accelDesc];
id<MTLAccelerationStructure> accel = [device
newAccelerationStructureWithSize:accelSizes.accelerationStructureSize];
id<MTLBuffer> scratchBuf = [device newBufferWithLength:accelSizes.buildScratchBufferSize
options:MTLResourceStorageModePrivate];
id<MTLCommandBuffer> accelCommands = [queue commandBuffer];
id<MTLAccelerationStructureCommandEncoder> accelEnc =
[accelCommands accelerationStructureCommandEncoder];
if (refit) {
[accelEnc refitAccelerationStructure:accel_struct
descriptor:accelDesc
destination:accel
scratchBuffer:scratchBuf
scratchBufferOffset:0];
}
else {
[accelEnc buildAccelerationStructure:accel
descriptor:accelDesc
scratchBuffer:scratchBuf
scratchBufferOffset:0];
}
[accelEnc endEncoding];
[accelCommands commit];
[accelCommands waitUntilCompleted];
if (motion_transforms_buf) {
[motion_transforms_buf release];
}
[instanceBuf release];
[scratchBuf release];
uint64_t allocated_size = [accel allocatedSize];
stats.mem_alloc(allocated_size);
/* Cache top and bottom-level acceleration structs */
accel_struct = accel;
blas_array.clear();
blas_array.reserve(all_blas.count);
for (id<MTLAccelerationStructure> blas in all_blas) {
blas_array.push_back(blas);
}
return true;
}
return false;
}
bool BVHMetal::build(Progress &progress,
id<MTLDevice> device,
id<MTLCommandQueue> queue,
bool refit)
{
if (@available(macos 12.0, *)) {
if (refit && params.bvh_type != BVH_TYPE_STATIC) {
assert(accel_struct);
}
else {
if (accel_struct) {
stats.mem_free(accel_struct.allocatedSize);
[accel_struct release];
accel_struct = nil;
}
}
}
if (!params.top_level) {
return build_BLAS(progress, device, queue, refit);
}
else {
return build_TLAS(progress, device, queue, refit);
}
}
CCL_NAMESPACE_END
#endif /* WITH_METAL */


@@ -0,0 +1,37 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "util/string.h"
#include "util/vector.h"
CCL_NAMESPACE_BEGIN
class Device;
class DeviceInfo;
class Profiler;
class Stats;
bool device_metal_init();
Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
void device_metal_info(vector<DeviceInfo> &devices);
string device_metal_capabilities();
CCL_NAMESPACE_END


@@ -0,0 +1,136 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_METAL
# include "device/metal/device.h"
# include "device/metal/device_impl.h"
#endif
#include "util/debug.h"
#include "util/set.h"
#include "util/system.h"
CCL_NAMESPACE_BEGIN
#ifdef WITH_METAL
Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
return new MetalDevice(info, stats, profiler);
}
bool device_metal_init()
{
return true;
}
static int device_metal_get_num_devices_safe(uint32_t *num_devices)
{
*num_devices = MTLCopyAllDevices().count;
return 0;
}
void device_metal_info(vector<DeviceInfo> &devices)
{
uint32_t num_devices = 0;
device_metal_get_num_devices_safe(&num_devices);
if (num_devices == 0) {
return;
}
vector<MetalPlatformDevice> usable_devices;
MetalInfo::get_usable_devices(&usable_devices);
/* Devices are numbered consecutively across platforms. */
set<string> unique_ids;
int device_index = 0;
for (MetalPlatformDevice &device : usable_devices) {
/* Compute unique ID for persistent user preferences. */
const string &device_name = device.device_name;
string id = string("METAL_") + device_name;
/* Hardware ID might not be unique, add device number in that case. */
if (unique_ids.find(id) != unique_ids.end()) {
id += string_printf("_ID_%d", num_devices);
}
unique_ids.insert(id);
/* Create DeviceInfo. */
DeviceInfo info;
info.type = DEVICE_METAL;
info.description = string_remove_trademark(string(device_name));
/* Ensure unique naming on Apple Silicon / SoC devices which return the same string for CPU and
* GPU */
if (info.description == system_cpu_brand_string()) {
info.description += " (GPU)";
}
info.num = device_index;
/* We don't know if it's used for display, but assume it is. */
info.display_device = true;
info.denoisers = DENOISER_NONE;
info.id = id;
devices.push_back(info);
device_index++;
}
}
string device_metal_capabilities()
{
string result = "";
string error_msg = "";
uint32_t num_devices = 0;
assert(device_metal_get_num_devices_safe(&num_devices));
if (num_devices == 0) {
return "No Metal devices found\n";
}
result += string_printf("Number of devices: %u\n", num_devices);
NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
for (id<MTLDevice> device in allDevices) {
result += string_printf("\t\tDevice: %s\n", [device.name UTF8String]);
}
return result;
}
#else
Device *device_metal_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
return nullptr;
}
bool device_metal_init()
{
return false;
}
void device_metal_info(vector<DeviceInfo> &devices)
{
}
string device_metal_capabilities()
{
return "";
}
#endif
CCL_NAMESPACE_END


@@ -0,0 +1,166 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_METAL
# include "bvh/bvh.h"
# include "device/device.h"
# include "device/metal/bvh.h"
# include "device/metal/device.h"
# include "device/metal/kernel.h"
# include "device/metal/queue.h"
# include "device/metal/util.h"
# include <Metal/Metal.h>
CCL_NAMESPACE_BEGIN
class DeviceQueue;
class MetalDevice : public Device {
public:
id<MTLDevice> mtlDevice = nil;
id<MTLLibrary> mtlLibrary[PSO_NUM] = {nil};
id<MTLArgumentEncoder> mtlBufferKernelParamsEncoder =
nil; /* encoder used for fetching device pointers from MTLBuffers */
id<MTLCommandQueue> mtlGeneralCommandQueue = nil;
id<MTLArgumentEncoder> mtlAncillaryArgEncoder =
nil; /* encoder used for fetching device pointers from MTLBuffers */
string source_used_for_compile[PSO_NUM];
KernelParamsMetal launch_params = {0};
/* MetalRT members ----------------------------------*/
BVHMetal *bvhMetalRT = nullptr;
bool motion_blur = false;
id<MTLArgumentEncoder> mtlASArgEncoder =
nil; /* encoder used for fetching device pointers from MTLAccelerationStructure */
/*---------------------------------------------------*/
string device_name;
MetalGPUVendor device_vendor;
uint kernel_features;
MTLResourceOptions default_storage_mode;
int max_threads_per_threadgroup;
int mtlDevId = 0;
bool first_error = true;
struct MetalMem {
device_memory *mem = nullptr;
int pointer_index = -1;
id<MTLBuffer> mtlBuffer = nil;
id<MTLTexture> mtlTexture = nil;
uint64_t offset = 0;
uint64_t size = 0;
void *hostPtr = nullptr;
bool use_UMA = false; /* If true, UMA memory in shared_pointer is being used. */
};
typedef map<device_memory *, unique_ptr<MetalMem>> MetalMemMap;
MetalMemMap metal_mem_map;
std::vector<id<MTLResource>> delayed_free_list;
std::recursive_mutex metal_mem_map_mutex;
/* Bindless Textures */
device_vector<TextureInfo> texture_info;
bool need_texture_info;
id<MTLArgumentEncoder> mtlTextureArgEncoder = nil;
id<MTLBuffer> texture_bindings_2d = nil;
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
MetalDeviceKernels kernels;
bool use_metalrt = false;
bool use_function_specialisation = false;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
void set_error(const string &error) override;
MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
virtual ~MetalDevice();
bool support_device(const uint /*kernel_features*/);
bool check_peer_access(Device *peer_device) override;
bool use_adaptive_compilation();
string get_source(const uint kernel_features);
string compile_kernel(const uint kernel_features, const char *name);
virtual bool load_kernels(const uint kernel_features) override;
void reserve_local_memory(const uint kernel_features);
void init_host_memory();
void load_texture_info();
virtual bool should_use_graphics_interop() override;
virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
/* ------------------------------------------------------------------ */
/* low-level memory management */
MetalMem *generic_alloc(device_memory &mem);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
void mem_alloc(device_memory &mem) override;
void mem_copy_to(device_memory &mem) override;
void mem_copy_from(device_memory &mem)
{
mem_copy_from(mem, -1, -1, -1, -1);
}
void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
void mem_zero(device_memory &mem) override;
void mem_free(device_memory &mem) override;
device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
virtual void const_copy_to(const char *name, void *host, size_t size) override;
void global_alloc(device_memory &mem);
void global_free(device_memory &mem);
void tex_alloc(device_texture &mem);
void tex_alloc_as_buffer(device_texture &mem);
void tex_free(device_texture &mem);
void flush_delayed_free_list();
};
CCL_NAMESPACE_END
#endif

File diff suppressed because it is too large.


@@ -0,0 +1,168 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_METAL
# include "device/kernel.h"
# include <Metal/Metal.h>
CCL_NAMESPACE_BEGIN
class MetalDevice;
enum {
METALRT_FUNC_DEFAULT_TRI,
METALRT_FUNC_DEFAULT_BOX,
METALRT_FUNC_SHADOW_TRI,
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
METALRT_FUNC_CURVE_ALL_SHADOW,
METALRT_FUNC_NUM
};
enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
/* Pipeline State Object types */
enum {
/* A kernel that can be used with all scenes, supporting all features.
* It is slow to compile, but only needs to be compiled once and is then
* cached for future render sessions. This allows a render to get underway
* on the GPU quickly.
*/
PSO_GENERIC,
/* A kernel that is relatively quick to compile, but is specialised for the
* scene being rendered. It only contains the functionality and even baked in
* constants for values that means it needs to be recompiled whenever a
* dependent setting is changed. The render performance of this kernel is
* significantly faster though, and justifies the extra compile time.
*/
/* METAL_WIP: This isn't used and will require more changes to enable. */
PSO_SPECIALISED,
PSO_NUM
};
const char *kernel_type_as_string(int kernel_type);
struct MetalKernelPipeline {
void release()
{
if (pipeline) {
[pipeline release];
pipeline = nil;
if (@available(macOS 11.0, *)) {
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
if (intersection_func_table[i]) {
[intersection_func_table[i] release];
intersection_func_table[i] = nil;
}
}
}
}
if (function) {
[function release];
function = nil;
}
if (@available(macOS 11.0, *)) {
for (int i = 0; i < METALRT_TABLE_NUM; i++) {
if (intersection_func_table[i]) {
[intersection_func_table[i] release];
}
}
}
}
bool loaded = false;
id<MTLFunction> function = nil;
id<MTLComputePipelineState> pipeline = nil;
API_AVAILABLE(macos(11.0))
id<MTLIntersectionFunctionTable> intersection_func_table[METALRT_TABLE_NUM] = {nil};
};
struct MetalKernelLoadDesc {
int pso_index = 0;
const char *function_name = nullptr;
int kernel_index = 0;
int threads_per_threadgroup = 0;
MTLFunctionConstantValues *constant_values = nullptr;
NSArray *linked_functions = nullptr;
struct IntersectorFunctions {
NSArray *defaults;
NSArray *shadow;
NSArray *local;
NSArray *operator[](int index) const
{
if (index == METALRT_TABLE_DEFAULT)
return defaults;
if (index == METALRT_TABLE_SHADOW)
return shadow;
return local;
}
} intersector_functions = {nullptr};
};
/* Metal kernel and associate occupancy information. */
class MetalDeviceKernel {
public:
~MetalDeviceKernel();
bool load(MetalDevice *device, MetalKernelLoadDesc const &desc, class MD5Hash const &md5);
void mark_loaded(int pso_index)
{
pso[pso_index].loaded = true;
}
int get_num_threads_per_block() const
{
return num_threads_per_block;
}
const MetalKernelPipeline &get_pso() const;
double load_duration = 0.0;
private:
MetalKernelPipeline pso[PSO_NUM];
int num_threads_per_block = 0;
};
/* Cache of Metal kernels for each DeviceKernel. */
class MetalDeviceKernels {
public:
bool load(MetalDevice *device, int kernel_type);
bool available(DeviceKernel kernel) const;
const MetalDeviceKernel &get(DeviceKernel kernel) const;
MetalDeviceKernel kernels_[DEVICE_KERNEL_NUM];
id<MTLFunction> rt_intersection_funcs[PSO_NUM][METALRT_FUNC_NUM] = {{nil}};
string loaded_md5[PSO_NUM];
};
CCL_NAMESPACE_END
#endif /* WITH_METAL */


@@ -0,0 +1,523 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_METAL
# include "device/metal/kernel.h"
# include "device/metal/device_impl.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/tbb.h"
# include "util/time.h"
CCL_NAMESPACE_BEGIN
/* limit to 2 MTLCompiler instances */
int max_mtlcompiler_threads = 2;
const char *kernel_type_as_string(int kernel_type)
{
switch (kernel_type) {
case PSO_GENERIC:
return "PSO_GENERIC";
case PSO_SPECIALISED:
return "PSO_SPECIALISED";
default:
assert(0);
}
return "";
}
MetalDeviceKernel::~MetalDeviceKernel()
{
for (int i = 0; i < PSO_NUM; i++) {
pso[i].release();
}
}
bool MetalDeviceKernel::load(MetalDevice *device,
MetalKernelLoadDesc const &desc_in,
MD5Hash const &md5)
{
__block MetalKernelLoadDesc const desc(desc_in);
if (desc.kernel_index == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
/* skip megakernel */
return true;
}
bool use_binary_archive = true;
if (getenv("CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
use_binary_archive = false;
}
id<MTLBinaryArchive> archive = nil;
string metalbin_path;
if (use_binary_archive) {
NSProcessInfo *processInfo = [NSProcessInfo processInfo];
string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
MD5Hash local_md5(md5);
local_md5.append(osVersion);
string metalbin_name = string(desc.function_name) + "." + local_md5.get_hex() +
to_string(desc.pso_index) + ".bin";
metalbin_path = path_cache_get(path_join("kernels", metalbin_name));
path_create_directories(metalbin_path);
if (path_exists(metalbin_path) && use_binary_archive) {
if (@available(macOS 11.0, *)) {
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
[archiveDesc release];
}
}
}
NSString *entryPoint = [@(desc.function_name) copy];
NSError *error = NULL;
if (@available(macOS 11.0, *)) {
MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
func_desc.name = entryPoint;
if (desc.constant_values) {
func_desc.constantValues = desc.constant_values;
}
pso[desc.pso_index].function = [device->mtlLibrary[desc.pso_index]
newFunctionWithDescriptor:func_desc
error:&error];
}
[entryPoint release];
if (pso[desc.pso_index].function == nil) {
NSString *err = [error localizedDescription];
string errors = [err UTF8String];
device->set_error(
string_printf("Error getting function \"%s\": %s", desc.function_name, errors.c_str()));
return false;
}
pso[desc.pso_index].function.label = [@(desc.function_name) copy];
__block MTLComputePipelineDescriptor *computePipelineStateDescriptor =
[[MTLComputePipelineDescriptor alloc] init];
computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = desc.threads_per_threadgroup;
computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth = true;
computePipelineStateDescriptor.computeFunction = pso[desc.pso_index].function;
if (@available(macOS 11.0, *)) {
/* Attach the additional functions to an MTLLinkedFunctions object */
if (desc.linked_functions) {
computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc] init];
computePipelineStateDescriptor.linkedFunctions.functions = desc.linked_functions;
}
computePipelineStateDescriptor.maxCallStackDepth = 1;
}
/* Create a new Compute pipeline state object */
MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
bool creating_new_archive = false;
if (@available(macOS 11.0, *)) {
if (use_binary_archive) {
if (!archive) {
MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc] init];
archiveDesc.url = nil;
archive = [device->mtlDevice newBinaryArchiveWithDescriptor:archiveDesc error:nil];
creating_new_archive = true;
double starttime = time_dt();
if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
error:&error]) {
NSString *errStr = [error localizedDescription];
metal_printf("Failed to add PSO to archive:\n%s\n",
errStr ? [errStr UTF8String] : "nil");
}
else {
double duration = time_dt() - starttime;
metal_printf("%2d | %-55s | %7.2fs\n",
desc.kernel_index,
device_kernel_as_string((DeviceKernel)desc.kernel_index),
duration);
if (desc.pso_index == PSO_GENERIC) {
this->load_duration = duration;
}
}
}
computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
}
}
double starttime = time_dt();
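/* This completion handler finalizes the PSO. It is invoked inline for blocking
 * (generic) loads, and from a background dispatch queue for asynchronous
 * (specialised) loads. */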
MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
id<MTLComputePipelineState> computePipelineState,
MTLComputePipelineReflection *reflection,
NSError *error) {
bool recreate_archive = false;
if (computePipelineState == nil && archive && !creating_new_archive) {
assert(0);
NSString *errStr = [error localizedDescription];
metal_printf(
"Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
"(error: %s)\n",
device_kernel_as_string((DeviceKernel)desc.kernel_index),
errStr ? [errStr UTF8String] : "nil");
computePipelineState = [device->mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:MTLPipelineOptionNone
reflection:nullptr
error:&error];
recreate_archive = true;
}
double duration = time_dt() - starttime;
if (computePipelineState == nil) {
NSString *errStr = [error localizedDescription];
device->set_error(string_printf("Failed to create compute pipeline state \"%s\", error: \n",
device_kernel_as_string((DeviceKernel)desc.kernel_index)) +
(errStr ? [errStr UTF8String] : "nil"));
metal_printf("%2d | %-55s | %7.2fs | FAILED!\n",
desc.kernel_index,
device_kernel_as_string((DeviceKernel)desc.kernel_index),
duration);
return;
}
pso[desc.pso_index].pipeline = computePipelineState;
num_threads_per_block = round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
computePipelineState.threadExecutionWidth);
num_threads_per_block = std::max(num_threads_per_block,
(int)computePipelineState.threadExecutionWidth);
if (!use_binary_archive) {
metal_printf("%2d | %-55s | %7.2fs\n",
desc.kernel_index,
device_kernel_as_string((DeviceKernel)desc.kernel_index),
duration);
if (desc.pso_index == PSO_GENERIC) {
this->load_duration = duration;
}
}
if (@available(macOS 11.0, *)) {
if (creating_new_archive || recreate_archive) {
if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
error:&error]) {
metal_printf("Failed to save binary archive, error:\n%s\n",
[[error localizedDescription] UTF8String]);
}
}
}
[computePipelineStateDescriptor release];
computePipelineStateDescriptor = nil;
if (device->use_metalrt && desc.linked_functions) {
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
if (@available(macOS 11.0, *)) {
MTLIntersectionFunctionTableDescriptor *ift_desc =
[[MTLIntersectionFunctionTableDescriptor alloc] init];
ift_desc.functionCount = desc.intersector_functions[table].count;
pso[desc.pso_index].intersection_func_table[table] = [pso[desc.pso_index].pipeline
newIntersectionFunctionTableWithDescriptor:ift_desc];
/* Finally write the function handles into this pipeline's table */
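/* Index 0 holds the triangle intersector and index 1 the bounding-box
 * intersector, matching the table_functions pairs built in
 * MetalDeviceKernels::load. */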
for (int i = 0; i < 2; i++) {
id<MTLFunctionHandle> handle = [pso[desc.pso_index].pipeline
functionHandleWithFunction:desc.intersector_functions[table][i]];
[pso[desc.pso_index].intersection_func_table[table] setFunction:handle atIndex:i];
}
}
}
}
mark_loaded(desc.pso_index);
};
if (desc.pso_index == PSO_SPECIALISED) {
/* Asynchronous load */
dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
NSError *error;
id<MTLComputePipelineState> pipeline = [device->mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:pipelineOptions
reflection:nullptr
error:&error];
completionHandler(pipeline, nullptr, error);
});
}
else {
/* Block on load to ensure we continue with a valid kernel function */
id<MTLComputePipelineState> pipeline = [device->mtlDevice
newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
options:pipelineOptions
reflection:nullptr
error:&error];
completionHandler(pipeline, nullptr, error);
}
return true;
}
const MetalKernelPipeline &MetalDeviceKernel::get_pso() const
{
if (pso[PSO_SPECIALISED].loaded) {
return pso[PSO_SPECIALISED];
}
assert(pso[PSO_GENERIC].loaded);
return pso[PSO_GENERIC];
}
bool MetalDeviceKernels::load(MetalDevice *device, int kernel_type)
{
bool any_error = false;
MD5Hash md5;
/* Build the function constant table */
MTLFunctionConstantValues *constant_values = nullptr;
if (kernel_type == PSO_SPECIALISED) {
constant_values = [MTLFunctionConstantValues new];
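/* Each KERNEL_* macro below binds one KernelData field as a Metal function
 * constant and appends its raw bytes to the MD5 that identifies this
 * specialisation. */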
# define KERNEL_FILM(_type, name) \
[constant_values setConstantValue:&data.film.name \
type:get_MTLDataType_##_type() \
atIndex:KernelData_film_##name]; \
md5.append((uint8_t *)&data.film.name, sizeof(data.film.name));
# define KERNEL_BACKGROUND(_type, name) \
[constant_values setConstantValue:&data.background.name \
type:get_MTLDataType_##_type() \
atIndex:KernelData_background_##name]; \
md5.append((uint8_t *)&data.background.name, sizeof(data.background.name));
# define KERNEL_INTEGRATOR(_type, name) \
[constant_values setConstantValue:&data.integrator.name \
type:get_MTLDataType_##_type() \
atIndex:KernelData_integrator_##name]; \
md5.append((uint8_t *)&data.integrator.name, sizeof(data.integrator.name));
# define KERNEL_BVH(_type, name) \
[constant_values setConstantValue:&data.bvh.name \
type:get_MTLDataType_##_type() \
atIndex:KernelData_bvh_##name]; \
md5.append((uint8_t *)&data.bvh.name, sizeof(data.bvh.name));
/* METAL_WIP: populate constant_values based on KernelData */
assert(0);
/*
const KernelData &data = device->launch_params.data;
# include "kernel/types/background.h"
# include "kernel/types/bvh.h"
# include "kernel/types/film.h"
# include "kernel/types/integrator.h"
*/
}
if (device->use_metalrt) {
if (@available(macOS 11.0, *)) {
/* create the id<MTLFunction> for each intersection function */
const char *function_names[] = {
"__anyhit__cycles_metalrt_visibility_test_tri",
"__anyhit__cycles_metalrt_visibility_test_box",
"__anyhit__cycles_metalrt_shadow_all_hit_tri",
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
"__intersection__curve_all_shadow",
};
assert(sizeof(function_names) / sizeof(function_names[0]) == METALRT_FUNC_NUM);
MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
if (kernel_type == PSO_SPECIALISED) {
desc.constantValues = constant_values;
}
for (int i = 0; i < METALRT_FUNC_NUM; i++) {
const char *function_name = function_names[i];
desc.name = [@(function_name) copy];
NSError *error = NULL;
rt_intersection_funcs[kernel_type][i] = [device->mtlLibrary[kernel_type]
newFunctionWithDescriptor:desc
error:&error];
if (rt_intersection_funcs[kernel_type][i] == nil) {
NSString *err = [error localizedDescription];
string errors = [err UTF8String];
device->set_error(string_printf(
"Error getting intersection function \"%s\": %s", function_name, errors.c_str()));
any_error = true;
break;
}
rt_intersection_funcs[kernel_type][i].label = [@(function_name) copy];
}
}
}
md5.append(device->source_used_for_compile[kernel_type]);
string hash = md5.get_hex();
if (loaded_md5[kernel_type] == hash) {
return true;
}
if (!any_error) {
NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
NSArray *function_list = nil;
if (device->use_metalrt) {
id<MTLFunction> box_intersect_default = nil;
id<MTLFunction> box_intersect_shadow = nil;
if (device->kernel_features & KERNEL_FEATURE_HAIR) {
/* Add curve intersection programs. */
if (device->kernel_features & KERNEL_FEATURE_HAIR_THICK) {
/* Use the slower all-primitive programs when thick hair is present, even
 * though that also slows down ribbons. Ideally this should not be needed. */
box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL];
box_intersect_shadow = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_ALL_SHADOW];
}
else {
box_intersect_default = rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON];
box_intersect_shadow =
rt_intersection_funcs[kernel_type][METALRT_FUNC_CURVE_RIBBON_SHADOW];
}
}
table_functions[METALRT_TABLE_DEFAULT] = [NSArray
arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_TRI],
box_intersect_default ?
box_intersect_default :
rt_intersection_funcs[kernel_type][METALRT_FUNC_DEFAULT_BOX],
nil];
table_functions[METALRT_TABLE_SHADOW] = [NSArray
arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_TRI],
box_intersect_shadow ?
box_intersect_shadow :
rt_intersection_funcs[kernel_type][METALRT_FUNC_SHADOW_BOX],
nil];
table_functions[METALRT_TABLE_LOCAL] = [NSArray
arrayWithObjects:rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_TRI],
rt_intersection_funcs[kernel_type][METALRT_FUNC_LOCAL_BOX],
nil];
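/* Gather the unique functions across all tables, sorted by label, so that the
 * linked-function list is deterministic from run to run. */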
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
function_list = [[NSArray arrayWithArray:[unique_functions allObjects]]
sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
return [f1.label compare:f2.label];
}];
unique_functions = nil;
}
metal_printf("Starting %s \"cycles_metal_...\" pipeline builds\n",
kernel_type_as_string(kernel_type));
tbb::task_arena local_arena(max_mtlcompiler_threads);
local_arena.execute([&]() {
tbb::parallel_for(int(0), int(DEVICE_KERNEL_NUM), [&](int i) {
/* skip megakernel */
if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
return;
}
/* Only specialise kernels where it can make an impact */
if (kernel_type == PSO_SPECIALISED) {
if (i < DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
return;
}
}
MetalDeviceKernel &kernel = kernels_[i];
const std::string function_name = std::string("cycles_metal_") +
device_kernel_as_string((DeviceKernel)i);
int threads_per_threadgroup = device->max_threads_per_threadgroup;
if (i > DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL && i < DEVICE_KERNEL_INTEGRATOR_RESET) {
/* Always use 512 for the sorting kernels */
threads_per_threadgroup = 512;
}
NSArray *kernel_function_list = nil;
if (i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE ||
i == DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK ||
i == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE) {
kernel_function_list = function_list;
}
MetalKernelLoadDesc desc;
desc.pso_index = kernel_type;
desc.kernel_index = i;
desc.linked_functions = kernel_function_list;
desc.intersector_functions.defaults = table_functions[METALRT_TABLE_DEFAULT];
desc.intersector_functions.shadow = table_functions[METALRT_TABLE_SHADOW];
desc.intersector_functions.local = table_functions[METALRT_TABLE_LOCAL];
desc.constant_values = constant_values;
desc.threads_per_threadgroup = threads_per_threadgroup;
desc.function_name = function_name.c_str();
bool success = kernel.load(device, desc, md5);
any_error |= !success;
});
});
}
bool loaded = !any_error;
if (loaded) {
loaded_md5[kernel_type] = hash;
}
return loaded;
}
const MetalDeviceKernel &MetalDeviceKernels::get(DeviceKernel kernel) const
{
return kernels_[(int)kernel];
}
bool MetalDeviceKernels::available(DeviceKernel kernel) const
{
return kernels_[(int)kernel].get_pso().function != nil;
}
CCL_NAMESPACE_END
#endif /* WITH_METAL*/

View File

@@ -0,0 +1,97 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_METAL
# include "device/kernel.h"
# include "device/memory.h"
# include "device/queue.h"
# include "device/metal/util.h"
# include "kernel/device/metal/globals.h"
# define metal_printf VLOG(4) << string_printf
CCL_NAMESPACE_BEGIN
class MetalDevice;
/* Metal implementation of the device queue. */
class MetalDeviceQueue : public DeviceQueue {
public:
MetalDeviceQueue(MetalDevice *device);
~MetalDeviceQueue();
virtual int num_concurrent_states(const size_t) const override;
virtual int num_concurrent_busy_states() const override;
virtual void init_execution() override;
virtual bool enqueue(DeviceKernel kernel,
const int work_size,
DeviceKernelArguments const &args) override;
virtual bool synchronize() override;
virtual void zero_to_device(device_memory &mem) override;
virtual void copy_to_device(device_memory &mem) override;
virtual void copy_from_device(device_memory &mem) override;
virtual bool kernel_available(DeviceKernel kernel) const override;
protected:
void prepare_resources(DeviceKernel kernel);
id<MTLComputeCommandEncoder> get_compute_encoder(DeviceKernel kernel);
id<MTLBlitCommandEncoder> get_blit_encoder();
MetalDevice *metal_device;
MetalBufferPool temp_buffer_pool;
API_AVAILABLE(macos(11.0), ios(14.0))
MTLCommandBufferDescriptor *command_buffer_desc = nullptr;
id<MTLDevice> mtlDevice = nil;
id<MTLCommandQueue> mtlCommandQueue = nil;
id<MTLCommandBuffer> mtlCommandBuffer = nil;
id<MTLComputeCommandEncoder> mtlComputeEncoder = nil;
id<MTLBlitCommandEncoder> mtlBlitEncoder = nil;
id<MTLSharedEvent> shared_event = nil;
MTLSharedEventListener *shared_event_listener = nil;
dispatch_queue_t event_queue;
dispatch_semaphore_t wait_semaphore;
struct CopyBack {
void *host_pointer;
void *gpu_mem;
uint64_t size;
};
std::vector<CopyBack> copy_back_mem;
uint64_t shared_event_id;
uint64_t command_buffers_submitted = 0;
uint64_t command_buffers_completed = 0;
Stats &stats;
void close_compute_encoder();
void close_blit_encoder();
};
CCL_NAMESPACE_END
#endif /* WITH_METAL */

View File

@@ -0,0 +1,602 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_METAL
# include "device/metal/queue.h"
# include "device/metal/device_impl.h"
# include "device/metal/kernel.h"
# include "util/path.h"
# include "util/string.h"
# include "util/time.h"
CCL_NAMESPACE_BEGIN
/* MetalDeviceQueue */
MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
: DeviceQueue(device), metal_device(device), stats(device->stats)
{
if (@available(macos 11.0, *)) {
command_buffer_desc = [[MTLCommandBufferDescriptor alloc] init];
command_buffer_desc.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;
}
mtlDevice = device->mtlDevice;
mtlCommandQueue = [mtlDevice newCommandQueue];
shared_event = [mtlDevice newSharedEvent];
shared_event_id = 1;
/* Shareable event listener */
event_queue = dispatch_queue_create("com.cycles.metal.event_queue", NULL);
shared_event_listener = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue];
wait_semaphore = dispatch_semaphore_create(0);
}
MetalDeviceQueue::~MetalDeviceQueue()
{
/* Tidying up here isn't really practical; we expect and require the work
 * queue to be empty at this point. */
assert(mtlCommandBuffer == nil);
assert(command_buffers_submitted == command_buffers_completed);
[shared_event_listener release];
[shared_event release];
if (@available(macos 11.0, *)) {
[command_buffer_desc release];
}
if (mtlCommandQueue) {
[mtlCommandQueue release];
mtlCommandQueue = nil;
}
}
int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
{
/* METAL_WIP */
/* TODO: compute automatically. */
/* TODO: must have at least num_threads_per_block. */
int result = 1048576;
if (metal_device->device_vendor == METAL_GPU_AMD) {
result *= 2;
}
else if (metal_device->device_vendor == METAL_GPU_APPLE) {
result *= 4;
}
return result;
}
int MetalDeviceQueue::num_concurrent_busy_states() const
{
/* METAL_WIP */
/* TODO: compute automatically. */
int result = 65536;
if (metal_device->device_vendor == METAL_GPU_AMD) {
result *= 2;
}
else if (metal_device->device_vendor == METAL_GPU_APPLE) {
result *= 4;
}
return result;
}
void MetalDeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
metal_device->load_texture_info();
synchronize();
}
bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
const int work_size,
DeviceKernelArguments const &args)
{
if (metal_device->have_error()) {
return false;
}
VLOG(3) << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
<< work_size;
const MetalDeviceKernel &metal_kernel = metal_device->kernels.get(kernel);
const MetalKernelPipeline &metal_kernel_pso = metal_kernel.get_pso();
id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
/* Determine size requirement for argument buffer. */
size_t arg_buffer_length = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
}
/* 256 is the Metal offset alignment for constant address space bindings */
arg_buffer_length = round_up(arg_buffer_length, 256);
/* Globals placed after "vanilla" arguments. */
size_t globals_offsets = arg_buffer_length;
arg_buffer_length += sizeof(KernelParamsMetal);
arg_buffer_length = round_up(arg_buffer_length, 256);
/* Metal ancillary bindless pointers. */
size_t metal_offsets = arg_buffer_length;
arg_buffer_length += metal_device->mtlAncillaryArgEncoder.encodedLength;
arg_buffer_length = round_up(arg_buffer_length, metal_device->mtlAncillaryArgEncoder.alignment);
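/* Final layout: [kernel arguments | KernelParamsMetal globals | ancillary
 * argument buffer], each region aligned as computed above. */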
/* Temporary buffer used to prepare arg_buffer */
uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
memset(init_arg_buffer, 0, arg_buffer_length);
/* Prepare the non-pointer "enqueue" arguments */
size_t bytes_written = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
bytes_written = round_up(bytes_written, size_in_bytes);
if (args.types[i] != DeviceKernelArguments::POINTER) {
memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
}
bytes_written += size_in_bytes;
}
/* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data.
 * The plain-old-data is contiguous, continuing to the end of KernelParamsMetal. */
size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, __integrator_state) +
sizeof(IntegratorStateGPU);
size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
(uint8_t *)&metal_device->launch_params + plain_old_launch_data_offset,
plain_old_launch_data_size);
/* Allocate an argument buffer. */
MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged;
if (@available(macOS 11.0, *)) {
if ([mtlDevice hasUnifiedMemory]) {
arg_buffer_options = MTLResourceStorageModeShared;
}
}
id<MTLBuffer> arg_buffer = temp_buffer_pool.get_buffer(
mtlDevice, mtlCommandBuffer, arg_buffer_length, arg_buffer_options, init_arg_buffer, stats);
/* Encode the pointer "enqueue" arguments */
bytes_written = 0;
for (size_t i = 0; i < args.count; i++) {
size_t size_in_bytes = args.sizes[i];
bytes_written = round_up(bytes_written, size_in_bytes);
if (args.types[i] == DeviceKernelArguments::POINTER) {
[metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
offset:bytes_written];
if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
[mtlComputeCommandEncoder useResource:mmem->mtlBuffer
usage:MTLResourceUsageRead | MTLResourceUsageWrite];
[metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
}
else {
if (@available(macos 12.0, *)) {
[metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
}
}
}
bytes_written += size_in_bytes;
}
/* Encode KernelParamsMetal buffers */
[metal_device->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer offset:globals_offsets];
/* This relies on the IntegratorStateGPU layout being contiguous device_ptrs. */
const size_t pointer_block_end = offsetof(KernelParamsMetal, __integrator_state) +
sizeof(IntegratorStateGPU);
for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
int pointer_index = offset / sizeof(device_ptr);
MetalDevice::MetalMem *mmem = *(
MetalDevice::MetalMem **)((uint8_t *)&metal_device->launch_params + offset);
if (mmem && (mmem->mtlBuffer || mmem->mtlTexture)) {
[metal_device->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
offset:0
atIndex:pointer_index];
}
else {
if (@available(macos 12.0, *)) {
[metal_device->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index];
}
}
}
bytes_written = globals_offsets + sizeof(KernelParamsMetal);
/* Encode ancillaries */
[metal_device->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_2d
offset:0
atIndex:0];
[metal_device->mtlAncillaryArgEncoder setBuffer:metal_device->texture_bindings_3d
offset:0
atIndex:1];
if (@available(macos 12.0, *)) {
if (metal_device->use_metalrt) {
if (metal_device->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device->bvhMetalRT->accel_struct;
[metal_device->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
}
for (int table = 0; table < METALRT_TABLE_NUM; table++) {
if (metal_kernel_pso.intersection_func_table[table]) {
[metal_kernel_pso.intersection_func_table[table] setBuffer:arg_buffer
offset:globals_offsets
atIndex:1];
[metal_device->mtlAncillaryArgEncoder
setIntersectionFunctionTable:metal_kernel_pso.intersection_func_table[table]
atIndex:3 + table];
[mtlComputeCommandEncoder useResource:metal_kernel_pso.intersection_func_table[table]
usage:MTLResourceUsageRead];
}
else {
[metal_device->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
atIndex:3 + table];
}
}
}
bytes_written = metal_offsets + metal_device->mtlAncillaryArgEncoder.encodedLength;
}
if (arg_buffer.storageMode == MTLStorageModeManaged) {
[arg_buffer didModifyRange:NSMakeRange(0, bytes_written)];
}
[mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
[mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
[mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
if (metal_device->use_metalrt) {
if (@available(macos 12.0, *)) {
auto bvhMetalRT = metal_device->bvhMetalRT;
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
break;
default:
bvhMetalRT = nil;
break;
}
if (bvhMetalRT) {
/* Mark all acceleration structure resources as used. */
[mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
[mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
count:bvhMetalRT->blas_array.size()
usage:MTLResourceUsageRead];
}
}
}
[mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso.pipeline];
/* Compute kernel launch parameters. */
const int num_threads_per_block = metal_kernel.get_num_threads_per_block();
int shared_mem_bytes = 0;
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
/* See parallel_active_index.h for why this amount of shared memory is needed.
* Rounded up to 16 bytes for Metal */
shared_mem_bytes = round_up((num_threads_per_block + 1) * sizeof(int), 16);
[mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
break;
default:
break;
}
MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
divide_up(work_size, num_threads_per_block), 1, 1);
MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
[mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
threadsPerThreadgroup:size_threads_per_threadgroup];
[mtlCommandBuffer addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
NSString *kernel_name = metal_kernel_pso.function.label;
/* Enhanced command buffer errors are only available in 11.0+ */
if (@available(macos 11.0, *)) {
if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
valueForKey:MTLCommandBufferEncoderInfoErrorKey];
if (encoderInfos != nil) {
for (id<MTLCommandBufferEncoderInfo> encoderInfo : encoderInfos) {
NSLog(@"%@", encoderInfo);
}
}
id<MTLLogContainer> logs = command_buffer.logs;
for (id<MTLFunctionLog> log in logs) {
NSLog(@"%@", log);
}
}
else if (command_buffer.error) {
printf("CommandBuffer Failed: %s\n", [kernel_name UTF8String]);
}
}
}];
return !(metal_device->have_error());
}
bool MetalDeviceQueue::synchronize()
{
if (metal_device->have_error()) {
return false;
}
if (mtlComputeEncoder) {
close_compute_encoder();
}
close_blit_encoder();
if (mtlCommandBuffer) {
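/* Signal a shared event when the command buffer completes and block on a
 * semaphore until the GPU reaches it, then service any deferred copy-backs. */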
uint64_t shared_event_id = this->shared_event_id++;
__block dispatch_semaphore_t block_sema = wait_semaphore;
[shared_event notifyListener:shared_event_listener
atValue:shared_event_id
block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
dispatch_semaphore_signal(block_sema);
}];
[mtlCommandBuffer encodeSignalEvent:shared_event value:shared_event_id];
[mtlCommandBuffer commit];
dispatch_semaphore_wait(wait_semaphore, DISPATCH_TIME_FOREVER);
[mtlCommandBuffer release];
for (const CopyBack &mmem : copy_back_mem) {
memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
}
copy_back_mem.clear();
temp_buffer_pool.process_command_buffer_completion(mtlCommandBuffer);
metal_device->flush_delayed_free_list();
mtlCommandBuffer = nil;
}
return !(metal_device->have_error());
}
void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
return;
}
/* Allocate on demand. */
if (mem.device_pointer == 0) {
metal_device->mem_alloc(mem);
}
/* Zero memory on device. */
assert(mem.device_pointer != 0);
std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem);
if (mmem.mtlBuffer) {
id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
[blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
}
else {
metal_device->mem_zero(mem);
}
}
void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
if (mem.memory_size() == 0) {
return;
}
/* Allocate on demand. */
if (mem.device_pointer == 0) {
metal_device->mem_alloc(mem);
}
assert(mem.device_pointer != 0);
assert(mem.host_pointer != nullptr);
std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
auto result = metal_device->metal_mem_map.find(&mem);
if (result != metal_device->metal_mem_map.end()) {
if (mem.host_pointer == mem.shared_pointer) {
return;
}
MetalDevice::MetalMem &mmem = *result->second;
id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
id<MTLBuffer> buffer = temp_buffer_pool.get_buffer(mtlDevice,
mtlCommandBuffer,
mmem.size,
MTLResourceStorageModeShared,
mem.host_pointer,
stats);
[blitEncoder copyFromBuffer:buffer
sourceOffset:0
toBuffer:mmem.mtlBuffer
destinationOffset:mmem.offset
size:mmem.size];
}
else {
metal_device->mem_copy_to(mem);
}
}
void MetalDeviceQueue::copy_from_device(device_memory &mem)
{
assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
if (mem.memory_size() == 0) {
return;
}
assert(mem.device_pointer != 0);
assert(mem.host_pointer != nullptr);
std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
MetalDevice::MetalMem &mmem = *metal_device->metal_mem_map.at(&mem);
if (mmem.mtlBuffer) {
const size_t size = mem.memory_size();
if (mem.device_pointer) {
if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
[blitEncoder synchronizeResource:mmem.mtlBuffer];
}
if (mem.host_pointer != mmem.hostPtr) {
if (mtlCommandBuffer) {
copy_back_mem.push_back({mem.host_pointer, mmem.hostPtr, size});
}
else {
memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size);
}
}
}
else {
memset((char *)mem.host_pointer, 0, size);
}
}
else {
metal_device->mem_copy_from(mem);
}
}
bool MetalDeviceQueue::kernel_available(DeviceKernel kernel) const
{
return metal_device->kernels.available(kernel);
}
void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
{
std::lock_guard<std::recursive_mutex> lock(metal_device->metal_mem_map_mutex);
/* declare resource usage */
for (auto &it : metal_device->metal_mem_map) {
device_memory *mem = it.first;
MTLResourceUsage usage = MTLResourceUsageRead;
if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
usage |= MTLResourceUsageWrite;
}
if (it.second->mtlBuffer) {
/* METAL_WIP - use array version (i.e. useResources) */
[mtlComputeEncoder useResource:it.second->mtlBuffer usage:usage];
}
else if (it.second->mtlTexture) {
/* METAL_WIP - use array version (i.e. useResources) */
[mtlComputeEncoder useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
}
}
/* ancillaries */
[mtlComputeEncoder useResource:metal_device->texture_bindings_2d usage:MTLResourceUsageRead];
[mtlComputeEncoder useResource:metal_device->texture_bindings_3d usage:MTLResourceUsageRead];
}
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
{
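/* Integrator kernels may be dispatched concurrently; all other kernels run on
 * a serial encoder. */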
bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
if (mtlComputeEncoder) {
if (mtlComputeEncoder.dispatchType == (concurrent ? MTLDispatchTypeConcurrent :
                                                    MTLDispatchTypeSerial)) {
/* declare usage of MTLBuffers etc */
prepare_resources(kernel);
return mtlComputeEncoder;
}
close_compute_encoder();
}
close_blit_encoder();
if (!mtlCommandBuffer) {
mtlCommandBuffer = [mtlCommandQueue commandBuffer];
[mtlCommandBuffer retain];
}
mtlComputeEncoder = [mtlCommandBuffer
computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
MTLDispatchTypeSerial];
/* declare usage of MTLBuffers etc */
prepare_resources(kernel);
return mtlComputeEncoder;
}
id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
{
if (mtlBlitEncoder) {
return mtlBlitEncoder;
}
if (mtlComputeEncoder) {
close_compute_encoder();
}
if (!mtlCommandBuffer) {
mtlCommandBuffer = [mtlCommandQueue commandBuffer];
[mtlCommandBuffer retain];
}
mtlBlitEncoder = [mtlCommandBuffer blitCommandEncoder];
return mtlBlitEncoder;
}
void MetalDeviceQueue::close_compute_encoder()
{
[mtlComputeEncoder endEncoding];
mtlComputeEncoder = nil;
}
void MetalDeviceQueue::close_blit_encoder()
{
if (mtlBlitEncoder) {
[mtlBlitEncoder endEncoding];
mtlBlitEncoder = nil;
}
}
CCL_NAMESPACE_END
#endif /* WITH_METAL */

View File

@@ -0,0 +1,101 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef WITH_METAL
# include <Metal/Metal.h>
# include <string>
# include "device/metal/device.h"
# include "device/metal/kernel.h"
# include "device/queue.h"
# include "util/thread.h"
CCL_NAMESPACE_BEGIN
enum MetalGPUVendor {
METAL_GPU_UNKNOWN = 0,
METAL_GPU_APPLE = 1,
METAL_GPU_AMD = 2,
METAL_GPU_INTEL = 3,
};
/* Retains a named MTLDevice for device enumeration. */
struct MetalPlatformDevice {
MetalPlatformDevice(id<MTLDevice> device, const string &device_name)
: device_id(device), device_name(device_name)
{
[device_id retain];
}
~MetalPlatformDevice()
{
[device_id release];
}
id<MTLDevice> device_id;
string device_name;
};
/* Contains static Metal helper functions. */
struct MetalInfo {
static bool device_version_check(id<MTLDevice> device);
static void get_usable_devices(vector<MetalPlatformDevice> *usable_devices);
static MetalGPUVendor get_vendor_from_device_name(string const &device_name);
/* Platform information. */
static bool get_num_devices(uint32_t *num_devices);
static uint32_t get_num_devices();
static bool get_device_name(id<MTLDevice> device_id, string *device_name);
static string get_device_name(id<MTLDevice> device_id);
};
/* Pool of MTLBuffers whose lifetime is linked to a single MTLCommandBuffer */
class MetalBufferPool {
struct MetalBufferListEntry {
MetalBufferListEntry(id<MTLBuffer> buffer, id<MTLCommandBuffer> command_buffer)
: buffer(buffer), command_buffer(command_buffer)
{
}
MetalBufferListEntry() = delete;
id<MTLBuffer> buffer;
id<MTLCommandBuffer> command_buffer;
};
std::vector<MetalBufferListEntry> buffer_free_list;
std::vector<MetalBufferListEntry> buffer_in_use_list;
thread_mutex buffer_mutex;
size_t total_temp_mem_size = 0;
public:
MetalBufferPool() = default;
~MetalBufferPool();
id<MTLBuffer> get_buffer(id<MTLDevice> device,
id<MTLCommandBuffer> command_buffer,
NSUInteger length,
MTLResourceOptions options,
const void *pointer,
Stats &stats);
void process_command_buffer_completion(id<MTLCommandBuffer> command_buffer);
};
CCL_NAMESPACE_END
#endif /* WITH_METAL */

View File

@@ -0,0 +1,241 @@
/*
* Copyright 2021 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef WITH_METAL
# include "device/metal/util.h"
# include "device/metal/device_impl.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/string.h"
# include "util/time.h"
# include <pwd.h>
# include <sys/shm.h>
# include <time.h>
CCL_NAMESPACE_BEGIN
MetalGPUVendor MetalInfo::get_vendor_from_device_name(string const &device_name)
{
if (device_name.find("Intel") != string::npos) {
return METAL_GPU_INTEL;
}
else if (device_name.find("AMD") != string::npos) {
return METAL_GPU_AMD;
}
else if (device_name.find("Apple") != string::npos) {
return METAL_GPU_APPLE;
}
return METAL_GPU_UNKNOWN;
}
bool MetalInfo::device_version_check(id<MTLDevice> device)
{
if (@available(macos 12.0, *)) {
MetalGPUVendor vendor = get_vendor_from_device_name([[device name] UTF8String]);
static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
if (forceIntel) {
/* return false for non-Intel GPUs to force selection of Intel */
if (vendor == METAL_GPU_INTEL) {
return true;
}
}
else {
switch (vendor) {
case METAL_GPU_INTEL:
/* isLowPower only returns true on machines that also have an AMD GPU;
 * on Intel-only machines it returns false. */
if (getenv("CYCLES_METAL_ALLOW_LOW_POWER_GPUS") || !device.isLowPower) {
return true;
}
return false;
case METAL_GPU_APPLE:
case METAL_GPU_AMD:
return true;
default:
return false;
}
}
}
return false;
}
void MetalInfo::get_usable_devices(vector<MetalPlatformDevice> *usable_devices)
{
static bool first_time = true;
# define FIRST_VLOG(severity) \
if (first_time) \
VLOG(severity)
usable_devices->clear();
NSArray<id<MTLDevice>> *allDevices = MTLCopyAllDevices();
for (id<MTLDevice> device in allDevices) {
string device_name;
if (!get_device_name(device, &device_name)) {
FIRST_VLOG(2) << "Failed to get device name, ignoring.";
continue;
}
static const char *forceIntelStr = getenv("CYCLES_METAL_FORCE_INTEL");
bool forceIntel = forceIntelStr ? (atoi(forceIntelStr) != 0) : false;
if (forceIntel && device_name.find("Intel") == string::npos) {
FIRST_VLOG(2) << "CYCLES_METAL_FORCE_INTEL causing non-Intel device " << device_name
<< " to be ignored.";
continue;
}
if (!device_version_check(device)) {
FIRST_VLOG(2) << "Ignoring device " << device_name << " due to too old compiler version.";
continue;
}
FIRST_VLOG(2) << "Adding new device " << device_name << ".";
usable_devices->push_back(MetalPlatformDevice(device, device_name));
}
first_time = false;
}
bool MetalInfo::get_num_devices(uint32_t *num_devices)
{
*num_devices = MTLCopyAllDevices().count;
return true;
}
uint32_t MetalInfo::get_num_devices()
{
uint32_t num_devices;
if (!get_num_devices(&num_devices)) {
return 0;
}
return num_devices;
}
bool MetalInfo::get_device_name(id<MTLDevice> device, string *platform_name)
{
*platform_name = [device.name UTF8String];
return true;
}
string MetalInfo::get_device_name(id<MTLDevice> device)
{
string platform_name;
if (!get_device_name(device, &platform_name)) {
return "";
}
return platform_name;
}
id<MTLBuffer> MetalBufferPool::get_buffer(id<MTLDevice> device,
id<MTLCommandBuffer> command_buffer,
NSUInteger length,
MTLResourceOptions options,
const void *pointer,
Stats &stats)
{
id<MTLBuffer> buffer;
MTLStorageMode storageMode = MTLStorageMode((options & MTLResourceStorageModeMask) >>
MTLResourceStorageModeShift);
MTLCPUCacheMode cpuCacheMode = MTLCPUCacheMode((options & MTLResourceCPUCacheModeMask) >>
MTLResourceCPUCacheModeShift);
buffer_mutex.lock();
for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end(); entry++) {
MetalBufferListEntry bufferEntry = *entry;
/* Reuse a buffer that matches the requested length, storage mode and CPU cache mode. */
if (bufferEntry.buffer.length == length && storageMode == bufferEntry.buffer.storageMode &&
cpuCacheMode == bufferEntry.buffer.cpuCacheMode) {
buffer = bufferEntry.buffer;
buffer_free_list.erase(entry);
bufferEntry.command_buffer = command_buffer;
buffer_in_use_list.push_back(bufferEntry);
buffer_mutex.unlock();
/* Copy over data */
if (pointer) {
memcpy(buffer.contents, pointer, length);
if (bufferEntry.buffer.storageMode == MTLStorageModeManaged) {
[buffer didModifyRange:NSMakeRange(0, length)];
}
}
return buffer;
}
}
// NSLog(@"Creating buffer of length %lu (%lu)", length, frameCount);
if (pointer) {
buffer = [device newBufferWithBytes:pointer length:length options:options];
}
else {
buffer = [device newBufferWithLength:length options:options];
}
MetalBufferListEntry buffer_entry(buffer, command_buffer);
stats.mem_alloc(buffer.allocatedSize);
total_temp_mem_size += buffer.allocatedSize;
buffer_in_use_list.push_back(buffer_entry);
buffer_mutex.unlock();
return buffer;
}
void MetalBufferPool::process_command_buffer_completion(id<MTLCommandBuffer> command_buffer)
{
assert(command_buffer);
thread_scoped_lock lock(buffer_mutex);
/* Move buffers whose command buffer has completed back into the free pool. */
for (auto entry = buffer_in_use_list.begin(); entry != buffer_in_use_list.end();) {
MetalBufferListEntry buffer_entry = *entry;
if (buffer_entry.command_buffer == command_buffer) {
entry = buffer_in_use_list.erase(entry);
buffer_entry.command_buffer = nil;
buffer_free_list.push_back(buffer_entry);
}
else {
entry++;
}
}
}
MetalBufferPool::~MetalBufferPool()
{
thread_scoped_lock lock(buffer_mutex);
/* Release all buffers remaining in the free pool. */
for (auto entry = buffer_free_list.begin(); entry != buffer_free_list.end();) {
MetalBufferListEntry buffer_entry = *entry;
id<MTLBuffer> buffer = buffer_entry.buffer;
// NSLog(@"Releasing buffer of length %lu (%lu) (%lu outstanding)", buffer.length, frameCount,
// bufferFreeList.size());
total_temp_mem_size -= buffer.allocatedSize;
[buffer release];
entry = buffer_free_list.erase(entry);
}
}
CCL_NAMESPACE_END
#endif /* WITH_METAL */

View File

@@ -182,6 +182,9 @@ class MultiDevice : public Device {
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
params.bvh_layout = sub.device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
BVH_LAYOUT_EMBREE;
else if (bvh->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
params.bvh_layout = sub.device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
BVH_LAYOUT_EMBREE;
/* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
* (since they are put into the top level directly, see bvh_embree.cpp) */

View File

@@ -19,7 +19,6 @@
#include "kernel/device/gpu/parallel_active_index.h"
#include "kernel/device/gpu/parallel_prefix_sum.h"
#include "kernel/device/gpu/parallel_sorted_index.h"
#include "kernel/device/gpu/work_stealing.h"
#include "kernel/sample/lcg.h"
@@ -30,6 +29,8 @@
# include "kernel/device/metal/context_begin.h"
#endif
#include "kernel/device/gpu/work_stealing.h"
#include "kernel/integrator/state.h"
#include "kernel/integrator/state_flow.h"
#include "kernel/integrator/state_util.h"
@@ -96,7 +97,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
ccl_gpu_kernel_call(
integrator_init_from_camera(nullptr, state, tile, render_buffer, x, y, sample));
@@ -127,7 +128,7 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
const int state = tile->path_index_offset + tile_work_index;
uint x, y, sample;
get_work_pixel(tile, tile_work_index, &x, &y, &sample);
ccl_gpu_kernel_call(get_work_pixel(tile, tile_work_index, &x, &y, &sample));
ccl_gpu_kernel_call(
integrator_init_from_bake(nullptr, state, tile, render_buffer, x, y, sample));

View File

@@ -117,7 +117,7 @@ struct kernel_gpu_##name \
uint simd_group_index, \
uint num_simd_groups) ccl_global const; \
}; \
kernel void kernel_metal_##name(device const kernel_gpu_##name *params_struct, \
kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
constant MetalAncillaries *_metal_ancillaries, \
threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \

View File

@@ -126,7 +126,7 @@ TReturn metalrt_local_hit(constant KernelParamsMetal &launch_params_metal,
[[intersection(triangle, triangle_data, METALRT_TAGS)]]
TriangleIntersectionResult
__anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
__anyhit__cycles_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionLocalPayload &payload [[payload]],
uint instance_id [[user_instance_id]],
uint primitive_id [[primitive_id]],
@@ -139,7 +139,7 @@ __anyhit__kernel_metalrt_local_hit_tri(constant KernelParamsMetal &launch_params
[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
BoundingBoxIntersectionResult
__anyhit__kernel_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
__anyhit__cycles_metalrt_local_hit_box(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
@@ -274,7 +274,7 @@ bool metalrt_shadow_all_hit(constant KernelParamsMetal &launch_params_metal,
[[intersection(triangle, triangle_data, METALRT_TAGS)]]
TriangleIntersectionResult
__anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
__anyhit__cycles_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionShadowPayload &payload [[payload]],
unsigned int object [[user_instance_id]],
unsigned int primitive_id [[primitive_id]],
@@ -292,7 +292,7 @@ __anyhit__kernel_metalrt_shadow_all_hit_tri(constant KernelParamsMetal &launch_p
[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
BoundingBoxIntersectionResult
__anyhit__kernel_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
__anyhit__cycles_metalrt_shadow_all_hit_box(const float ray_tmax [[max_distance]])
{
/* unused function */
BoundingBoxIntersectionResult result;
@@ -345,7 +345,7 @@ inline TReturnType metalrt_visibility_test(constant KernelParamsMetal &launch_pa
[[intersection(triangle, triangle_data, METALRT_TAGS)]]
TriangleIntersectionResult
__anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
__anyhit__cycles_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_params_metal [[buffer(1)]],
ray_data MetalKernelContext::MetalRTIntersectionPayload &payload [[payload]],
unsigned int object [[user_instance_id]],
unsigned int primitive_id [[primitive_id]])
@@ -362,7 +362,7 @@ __anyhit__kernel_metalrt_visibility_test_tri(constant KernelParamsMetal &launch_
[[intersection(bounding_box, triangle_data, METALRT_TAGS)]]
BoundingBoxIntersectionResult
__anyhit__kernel_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
__anyhit__cycles_metalrt_visibility_test_box(const float ray_tmax [[max_distance]])
{
/* Unused function */
BoundingBoxIntersectionResult result;

View File

@@ -719,6 +719,20 @@ ccl_device_inline float pow22(float a)
return sqr(a * sqr(sqr(sqr(a)) * a));
}
#ifdef __KERNEL_METAL__
ccl_device_inline float lgammaf(float x)
{
/* Nemes, Gergő (2010), "New asymptotic expansion for the Gamma function",
 * Archiv der Mathematik. */
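/* This computes
 *   lgamma(x) ~= 0.5 * (log(2*pi) - log(x))
 *                + x * (log(x) + 0.5 * log(x * sinh(1/x) + 1/(810*x^6)) - 1),
 * the logarithm of Nemes' closed-form Gamma approximation. */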
const float _1_810 = 1.0f / 810.0f;
const float log2pi = 1.83787706641f;
const float logx = log(x);
return (log2pi - logx +
        x * (logx * 2.0f + log(x * sinh(1.0f / x) + (_1_810 / pow(x, 6.0f))) - 2.0f)) *
       0.5f;
}
#endif
ccl_device_inline float beta(float x, float y)
{
return expf(lgammaf(x) + lgammaf(y) - lgammaf(x + y));

View File

@@ -750,6 +750,170 @@ bool path_remove(const string &path)
return remove(path.c_str()) == 0;
}
struct SourceReplaceState {
typedef map<string, string> ProcessedMapping;
/* Base directory for all relative include headers. */
string base;
/* Result of processed files. */
ProcessedMapping processed_files;
/* Set of files containing #pragma once which have been included. */
set<string> pragma_onced;
};
static string path_source_replace_includes_recursive(const string &source,
const string &source_filepath,
SourceReplaceState *state);
static string path_source_handle_preprocessor(const string &preprocessor_line,
const string &source_filepath,
const size_t line_number,
SourceReplaceState *state)
{
string result = preprocessor_line;
string rest_of_line = string_strip(preprocessor_line.substr(1));
if (0 == strncmp(rest_of_line.c_str(), "include", 7)) {
rest_of_line = string_strip(rest_of_line.substr(8));
if (rest_of_line[0] == '"') {
const size_t n_start = 1;
const size_t n_end = rest_of_line.find("\"", n_start);
const string filename = rest_of_line.substr(n_start, n_end - n_start);
string filepath = path_join(state->base, filename);
if (!path_exists(filepath)) {
filepath = path_join(path_dirname(source_filepath), filename);
}
string text;
if (path_read_text(filepath, text)) {
text = path_source_replace_includes_recursive(text, filepath, state);
/* Paste the processed file contents in place of the include directive. */
return "\n" + text + "\n";
}
}
}
return result;
}
/* Our own little C preprocessor that replaces #includes with the file
 * contents, to work around the issue of OpenCL drivers not supporting
 * include paths with spaces in them.
 */
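/* For example, the line `#include "util/types.h"` is replaced inline with the
 * full, recursively expanded contents of that header. */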
static string path_source_replace_includes_recursive(const string &_source,
const string &source_filepath,
SourceReplaceState *state)
{
const string *psource = &_source;
string source_new;
auto pragma_once = _source.find("#pragma once");
if (pragma_once != string::npos) {
if (state->pragma_onced.find(source_filepath) != state->pragma_onced.end()) {
return "";
}
state->pragma_onced.insert(source_filepath);
// "#pragma once"
// "//prgma once"
source_new = _source;
memcpy(source_new.data() + pragma_once, "//pr", 4);
psource = &source_new;
}
/* Try to re-use processed file without spending time on replacing all
* include directives again.
*/
SourceReplaceState::ProcessedMapping::iterator replaced_file = state->processed_files.find(
source_filepath);
if (replaced_file != state->processed_files.end()) {
return replaced_file->second;
}
const string &source = *psource;
/* Perform full file processing. */
string result = "";
const size_t source_length = source.length();
size_t index = 0;
/* Information about where we are in the source. */
size_t line_number = 0, column_number = 1;
/* Currently gathered non-preprocessor token.
* Store as start/length rather than token itself to avoid overhead of
* memory re-allocations on each character concatenation.
*/
size_t token_start = 0, token_length = 0;
/* Denotes whether we're inside of preprocessor line, together with
* preprocessor line itself.
*
* TODO(sergey): Investigate whether using token start/end position
* gives measurable speedup.
*/
bool inside_preprocessor = false;
string preprocessor_line = "";
/* Actual loop over the whole source. */
while (index < source_length) {
char ch = source[index];
if (ch == '\n') {
if (inside_preprocessor) {
string block = path_source_handle_preprocessor(
preprocessor_line, source_filepath, line_number, state);
if (!block.empty()) {
result += block;
}
/* Start gathering the next part of the token. */
token_start = index;
token_length = 0;
inside_preprocessor = false;
preprocessor_line = "";
}
column_number = 0;
++line_number;
}
else if (ch == '#' && column_number == 1 && !inside_preprocessor) {
/* Append all possible non-preprocessor token to the result. */
if (token_length != 0) {
result.append(source, token_start, token_length);
token_start = index;
token_length = 0;
}
inside_preprocessor = true;
}
if (inside_preprocessor) {
preprocessor_line += ch;
}
else {
++token_length;
}
++index;
++column_number;
}
/* Append possible tokens which happened before special events handled
* above.
*/
if (token_length != 0) {
result.append(source, token_start, token_length);
}
if (inside_preprocessor) {
result += path_source_handle_preprocessor(
preprocessor_line, source_filepath, line_number, state);
}
/* Store result for further reuse. */
state->processed_files[source_filepath] = result;
return result;
}
string path_source_replace_includes(const string &source, const string &path)
{
SourceReplaceState state;
state.base = path;
return path_source_replace_includes_recursive(source, path, &state);
}
FILE *path_fopen(const string &path, const string &mode)
{
#ifdef _WIN32

View File

@@ -66,6 +66,9 @@ bool path_read_text(const string &path, string &text);
/* File manipulation. */
bool path_remove(const string &path);
/* source code utility */
string path_source_replace_includes(const string &source, const string &path);
/* cache utility */
void path_cache_clear_except(const string &name, const set<string> &except);