Alternative Upload geometry data in parallel to multiple GPUs using the "Multi-Device" #107552

Open
William Leeson wants to merge 137 commits from leesonw/blender-cluster:upload_changed into main

1 changed file with 98 additions and 103 deletions
Showing only changes of commit 511d878921


@@ -26,74 +26,68 @@ class MultiDevice : public Device {
  public:
   struct SubDevice {
     Stats stats;
-    Device *device;
+    unique_ptr<Device> device;
     map<device_ptr, device_ptr> ptr_map;
     int peer_island_index = -1;
   };

-  // Switch from list to a vector to make the parallel_for easily map to the integer id.
-  // Also id now could be used to access the real device pointer more quickly. Also, since
-  // the vector reallocates the memory on resize the sub-devices are stored as pointers.
-  vector<SubDevice *> devices;
+  /* Switch from list to a vector to make the parallel_for easily map to the integer id.
+   * The id can now also be used to access the real device pointer more quickly. Since the
+   * vector reallocates its memory on resize, the sub-devices are stored as pointers. */
+  vector<unique_ptr<SubDevice>> devices;
   device_ptr unique_key;
   vector<vector<SubDevice *>> peer_islands;

   MultiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
       : Device(info, stats, profiler), unique_key(1)
   {
     int cpu_device_idx = -1;
     foreach (const DeviceInfo &subinfo, info.multi_devices) {
       /* Always add CPU devices at the back since GPU devices can change
        * host memory pointers, which CPU uses as device pointer. */
-      SubDevice *sub = new SubDevice;
+      unique_ptr<SubDevice> sub = make_unique<SubDevice>();
       if (subinfo.type == DEVICE_CPU) {
         assert(cpu_device_idx == -1);
         cpu_device_idx = devices.size();
       }
-      devices.emplace_back(sub);
-      sub->device = Device::create(subinfo, sub->stats, profiler);
+      sub->device = std::unique_ptr<Device>(Device::create(subinfo, sub->stats, profiler));
+      devices.emplace_back(std::move(sub));
     }

     /* Swop the CPU device with the last device to ensure the CPU device is the last */
     {
       int last = devices.size() - 1;
       if ((cpu_device_idx != -1) && (cpu_device_idx != last)) {
         std::swap(devices[last], devices[cpu_device_idx]);
       }
     }

     /* Build a list of peer islands for the available render devices */
-    foreach (SubDevice *sub, devices) {
+    foreach (auto &sub, devices) {
       /* First ensure that every device is in at least once peer island */
       if (sub->peer_island_index < 0) {
         peer_islands.emplace_back();
         sub->peer_island_index = (int)peer_islands.size() - 1;
-        peer_islands[sub->peer_island_index].push_back(sub);
+        peer_islands[sub->peer_island_index].push_back(sub.get());
       }

       if (!info.has_peer_memory) {
         continue;
       }

       /* Second check peer access between devices and fill up the islands accordingly */
-      foreach (SubDevice *peer_sub, devices) {
+      foreach (auto &peer_sub, devices) {
         if (peer_sub->peer_island_index < 0 &&
             peer_sub->device->info.type == sub->device->info.type &&
-            peer_sub->device->check_peer_access(sub->device)) {
+            peer_sub->device->check_peer_access(sub->device.get())) {
           peer_sub->peer_island_index = sub->peer_island_index;
-          peer_islands[sub->peer_island_index].push_back(peer_sub);
+          peer_islands[sub->peer_island_index].push_back(peer_sub.get());
         }
       }
     }
   }

-  ~MultiDevice()
-  {
-    foreach (SubDevice *sub, devices) {
-      delete sub->device;
-      delete sub;
-    }
-  }
+  ~MultiDevice() {}

   int get_num_devices() const override
   {
     return devices.size();
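
Why the move to `unique_ptr` is safe for the raw `SubDevice *` entries kept in `peer_islands`: the vector owns heap-allocated sub-devices, so each `SubDevice` keeps a stable address even when the vector reallocates on growth, and the explicit `delete` loop in the old destructor becomes unnecessary. A minimal standalone sketch of that invariant, using placeholder `Dev`/`SubDev` types rather than the actual Cycles classes:

```cpp
#include <iostream>
#include <memory>
#include <vector>

/* Stand-ins for the real Device/SubDevice types. */
struct Dev {
  int id;
};
struct SubDev {
  std::unique_ptr<Dev> device; /* owning, freed automatically */
};

int main()
{
  std::vector<std::unique_ptr<SubDev>> devices;

  /* Fill the vector; each SubDev lives on the heap, so its address
   * stays valid even when the vector reallocates on growth. */
  SubDev *first = nullptr;
  for (int i = 0; i < 100; i++) {
    auto sub = std::make_unique<SubDev>();
    sub->device = std::make_unique<Dev>(Dev{i});
    if (i == 0)
      first = sub.get();
    devices.emplace_back(std::move(sub));
  }

  /* 'first' still points at the same SubDev despite reallocations,
   * which is what lets peer_islands hold raw SubDevice pointers. */
  std::cout << (first == devices[0].get()) << "\n";

  /* No explicit destructor needed: unique_ptr releases everything. */
}
```

The same borrowing pattern explains the `.get()` calls throughout this commit: ownership stays in the vector, while `peer_islands` and `check_peer_access()` only receive non-owning views.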
@@ -103,7 +97,7 @@ class MultiDevice : public Device {
   {
     error_msg.clear();

-    foreach (SubDevice *sub, devices)
+    foreach (auto &sub, devices)
       error_msg += sub->device->error_message();

     return error_msg;
@@ -113,7 +107,7 @@ class MultiDevice : public Device {
   {
     BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
     BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
-    foreach (const SubDevice *sub_device, devices) {
+    foreach (const auto &sub_device, devices) {
       BVHLayoutMask device_bvh_layout_mask = sub_device->device->get_bvh_layout_mask();
       bvh_layout_mask &= device_bvh_layout_mask;
       bvh_layout_mask_all |= device_bvh_layout_mask;
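
The two masks accumulated here answer different questions: the AND-accumulated `bvh_layout_mask` keeps only layouts that every sub-device supports, while the OR-accumulated `bvh_layout_mask_all` records layouts that any device supports. A small illustration with hypothetical layout bits (the real `BVH_LAYOUT_*` values live in Cycles headers):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
  /* Layout bits, by analogy with BVH_LAYOUT_* flags (values hypothetical). */
  const std::uint32_t LAYOUT_BVH2 = 1u << 0;
  const std::uint32_t LAYOUT_EMBREE = 1u << 1;
  const std::uint32_t LAYOUT_OPTIX = 1u << 2;

  /* Per-device support masks: AND finds layouts every device supports,
   * OR records every layout any device supports. */
  std::uint32_t mask_all_devices = ~0u;
  std::uint32_t mask_any_device = 0u;
  for (std::uint32_t m : {LAYOUT_BVH2 | LAYOUT_EMBREE, LAYOUT_BVH2 | LAYOUT_OPTIX}) {
    mask_all_devices &= m;
    mask_any_device |= m;
  }

  assert(mask_all_devices == LAYOUT_BVH2);
  assert(mask_any_device == (LAYOUT_BVH2 | LAYOUT_EMBREE | LAYOUT_OPTIX));
}
```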
@@ -144,7 +138,7 @@ class MultiDevice : public Device {
   bool load_kernels(const uint kernel_features) override
   {
-    foreach (SubDevice *sub, devices)
+    foreach (auto &sub, devices)
       if (!sub->device->load_kernels(kernel_features))
         return false;
@@ -153,14 +147,14 @@ class MultiDevice : public Device {
   bool load_osl_kernels() override
   {
-    foreach (SubDevice *sub, devices)
+    foreach (auto &sub, devices)
       if (!sub->device->load_osl_kernels())
         return false;

     return true;
   }

-void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
+  void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
   {
     /* Try to build and share a single acceleration structure, if possible */
     if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2 || bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
@@ -177,38 +171,39 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
     bvh_multi->sub_bvhs.resize(devices.size());

     /* Broadcast acceleration structure build to all render devices */
-    parallel_for(size_t(0), devices.size(), [this, &bvh_multi, &dscene, refit, &progress](size_t id) {
-      // WL: Pointer translation is removed as it is not thread safe. Instead a new method is added
-      // to retrieve the real device pointer.
-      SubDevice *sub = devices[id];
-
-      if (!bvh_multi->sub_bvhs[id]) {
-        BVHParams params = bvh_multi->params;
-        if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
-          params.bvh_layout = BVH_LAYOUT_OPTIX;
-        else if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_METAL)
-          params.bvh_layout = BVH_LAYOUT_METAL;
-        else if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
-          params.bvh_layout = sub->device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
-                                                                       BVH_LAYOUT_EMBREE;
-        else if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
-          params.bvh_layout = sub->device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
-                                                                       BVH_LAYOUT_EMBREE;
-
-        /* Skip building a bottom level acceleration structure for non-instanced geometry on Embree
-         * (since they are put into the top level directly, see bvh_embree.cpp) */
-        if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
-            !bvh_multi->geometry[0]->is_instanced()) {
-        }
-        else {
-          bvh_multi->sub_bvhs[id] = BVH::create(
-              params, bvh_multi->geometry, bvh_multi->objects, sub->device);
-        }
-      }
-
-      if (bvh_multi->sub_bvhs[id]) {
-        sub->device->build_bvh(bvh_multi->sub_bvhs[id], dscene, progress, refit);
-      }
-    });
+    parallel_for(
+        size_t(0), devices.size(), [this, &bvh_multi, &dscene, refit, &progress](size_t id) {
+          /* Pointer translation is removed as it is not thread-safe. Instead, a new method is
+           * added to retrieve the real device pointer. */
+          auto &sub = devices[id];
+
+          if (!bvh_multi->sub_bvhs[id]) {
+            BVHParams params = bvh_multi->params;
+            if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX)
+              params.bvh_layout = BVH_LAYOUT_OPTIX;
+            else if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_METAL)
+              params.bvh_layout = BVH_LAYOUT_METAL;
+            else if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_OPTIX_EMBREE)
+              params.bvh_layout = sub->device->info.type == DEVICE_OPTIX ? BVH_LAYOUT_OPTIX :
+                                                                           BVH_LAYOUT_EMBREE;
+            else if (bvh_multi->params.bvh_layout == BVH_LAYOUT_MULTI_METAL_EMBREE)
+              params.bvh_layout = sub->device->info.type == DEVICE_METAL ? BVH_LAYOUT_METAL :
+                                                                           BVH_LAYOUT_EMBREE;
+
+            /* Skip building a bottom level acceleration structure for non-instanced geometry on
+             * Embree (since they are put into the top level directly, see bvh_embree.cpp) */
+            if (!params.top_level && params.bvh_layout == BVH_LAYOUT_EMBREE &&
+                !bvh_multi->geometry[0]->is_instanced()) {
+            }
+            else {
+              bvh_multi->sub_bvhs[id] = BVH::create(
+                  params, bvh_multi->geometry, bvh_multi->objects, sub->device.get());
+            }
+          }
+
+          if (bvh_multi->sub_bvhs[id]) {
+            sub->device->build_bvh(bvh_multi->sub_bvhs[id], dscene, progress, refit);
+          }
+        });
   }

   virtual void *get_cpu_osl_memory() override
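
A note on the threading model the hunk above relies on: Cycles' `parallel_for` is a TBB-style indexed loop, so each lambda invocation receives a distinct integer `id` and touches only its own `devices[id]` and `sub_bvhs[id]` slots, which is why the old pointer translation (not thread-safe, per the removed comment) is no longer needed during the build. A race-free sketch of the pattern, with a hypothetical `FakeDevice` standing in for the real device class:

```cpp
#include <cstddef>
#include <vector>
#include <tbb/parallel_for.h>

/* Hypothetical per-device build step standing in for Device::build_bvh(). */
struct FakeDevice {
  void build(int /*bvh_id*/) { /* device-local work, no shared state */ }
};

int main()
{
  std::vector<FakeDevice> devices(4);
  std::vector<int> sub_bvhs(devices.size(), 0);

  /* Each lambda invocation gets a distinct index, so every worker
   * touches only devices[id] and sub_bvhs[id]; no translation through
   * a shared map is needed, avoiding the thread-safety issue. */
  tbb::parallel_for(std::size_t(0), devices.size(), [&](std::size_t id) {
    sub_bvhs[id] = static_cast<int>(id); /* per-slot write: data-race free */
    devices[id].build(sub_bvhs[id]);
  });
}
```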
@@ -224,9 +219,9 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
   bool is_resident(device_ptr key, Device *sub_device) override
   {
-    foreach (SubDevice *sub, devices) {
-      if (sub->device == sub_device) {
-        return find_matching_mem_device(key, sub)->device == sub_device;
+    foreach (auto &sub, devices) {
+      if (sub->device.get() == sub_device) {
+        return find_matching_mem_device(key, sub.get())->device.get() == sub_device;
       }
     }
     return false;
@@ -268,9 +263,9 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
   inline device_ptr find_matching_mem(device_ptr key, Device *dev) override
   {
     device_ptr ptr = 0;
-    foreach (SubDevice *sub, devices) {
-      if (sub->device == dev) {
-        return find_matching_mem_device(key, sub)->ptr_map[key];
+    foreach (auto &sub, devices) {
+      if (sub->device.get() == dev) {
+        return find_matching_mem_device(key, sub.get())->ptr_map[key];
       }
     }
     return ptr;
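
For context on what `ptr_map` does here: the multi-device hands out a synthetic `device_ptr` key, and each `SubDevice` maps that key to the real pointer of its own allocation, so `find_matching_mem_device()` has to locate the sub-device in an island that actually owns the memory before the key can be translated. A standalone sketch of that two-step lookup, with a hypothetical `find_owner()` that mirrors the idea rather than the exact Cycles logic:

```cpp
#include <cassert>
#include <cstdint>
#include <map>
#include <vector>

using device_ptr = std::uint64_t;

/* Simplified stand-in: each sub-device translates the multi-device
 * key to its own real allocation pointer. */
struct SubDev {
  std::map<device_ptr, device_ptr> ptr_map;
};

/* Hypothetical lookup: prefer a sub-device in the island that already
 * has a mapping for the key, i.e. the one owning the allocation. */
SubDev *find_owner(device_ptr key, std::vector<SubDev *> &island)
{
  for (SubDev *sub : island) {
    if (sub->ptr_map.count(key))
      return sub;
  }
  return island.front();
}

int main()
{
  SubDev a, b;
  std::vector<SubDev *> island = {&a, &b};

  device_ptr unique_key = 1;      /* synthetic key handed out by the multi-device */
  b.ptr_map[unique_key] = 0xB000; /* real pointer lives on device b */

  assert(find_owner(unique_key, island) == &b);
  assert(b.ptr_map[unique_key] == 0xB000);
}
```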
@@ -289,7 +284,7 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
     /* The remaining memory types can be distributed across devices */
     foreach (const vector<SubDevice *> &island, peer_islands) {
       SubDevice *owner_sub = find_suitable_mem_device(key, island);
-      mem.device = owner_sub->device;
+      mem.device = owner_sub->device.get();
       mem.device_pointer = 0;
       mem.device_size = 0;
@@ -311,7 +306,7 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
     /* The tile buffers are allocated on each device (see below), so copy to all of them */
     foreach (const vector<SubDevice *> &island, peer_islands) {
       SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
-      mem.device = owner_sub->device;
+      mem.device = owner_sub->device.get();
       mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
       mem.device_size = existing_size;
@@ -338,12 +333,12 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
     device_ptr key = mem.device_pointer;
     size_t i = 0, sub_h = h / devices.size();
-    foreach (SubDevice *sub, devices) {
+    foreach (auto &sub, devices) {
       size_t sy = y + i * sub_h;
       size_t sh = (i == (size_t)devices.size() - 1) ? h - sub_h * i : sub_h;
-      SubDevice *owner_sub = find_matching_mem_device(key, sub);
-      mem.device = owner_sub->device;
+      SubDevice *owner_sub = find_matching_mem_device(key, sub.get());
+      mem.device = owner_sub->device.get();
       mem.device_pointer = owner_sub->ptr_map[key];

       owner_sub->device->mem_copy_from(mem, sy, w, sh, elem);
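
The slice arithmetic in the hunk above deserves a second look: each device copies `sub_h = h / n` rows starting at row `y + i * sub_h`, and the last device also absorbs the remainder, so the slices cover all `h` rows exactly once with no overlap. A self-contained check of that arithmetic (values chosen arbitrarily):

```cpp
#include <cassert>
#include <cstddef>

int main()
{
  /* Same slice arithmetic as the mem_copy_from loop: h rows split
   * over n devices, with the last device absorbing the remainder. */
  const std::size_t h = 10, n = 3;
  const std::size_t sub_h = h / n; /* = 3 */

  std::size_t total = 0;
  for (std::size_t i = 0; i < n; i++) {
    std::size_t sh = (i == n - 1) ? h - sub_h * i : sub_h; /* 3, 3, 4 */
    total += sh;
  }
  assert(total == h); /* every row is copied exactly once */
}
```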
@@ -362,7 +357,7 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
     foreach (const vector<SubDevice *> &island, peer_islands) {
       SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
-      mem.device = owner_sub->device;
+      mem.device = owner_sub->device.get();
       mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
       mem.device_size = existing_size;
@@ -385,7 +380,7 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
     /* Free memory that was allocated for all devices (see above) on each device */
     foreach (const vector<SubDevice *> &island, peer_islands) {
       SubDevice *owner_sub = find_matching_mem_device(key, island.front());
-      mem.device = owner_sub->device;
+      mem.device = owner_sub->device.get();
       mem.device_pointer = owner_sub->ptr_map[key];
       mem.device_size = existing_size;
       owner_sub->device->mem_free(mem);
@@ -413,7 +408,7 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
   void const_copy_to(const char *name, void *host, size_t size) override
   {
-    foreach (SubDevice *sub, devices)
+    foreach (auto &sub, devices)
       sub->device->const_copy_to(name, host, size);
   }
@@ -421,8 +416,8 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
   {
     int i = 0;
-    for (const SubDevice *sub : devices) {
-      if (sub->device == sub_device)
+    for (const auto &sub : devices) {
+      if (sub->device.get() == sub_device)
         return i;
       i++;
     }
@@ -432,7 +427,7 @@ void build_bvh(BVH *bvh, DeviceScene *dscene, Progress &progress, bool refit) override
   virtual void foreach_device(const function<void(Device *)> &callback) override
   {
-    foreach (SubDevice *sub, devices) {
+    foreach (auto &sub, devices) {
       sub->device->foreach_device(callback);
     }
   }