Alternative Upload geometry data in parallel to multiple GPUs using the "Multi-Device" #107552
|
@ -88,6 +88,7 @@ class CPUDevice : public Device {
|
|||
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
|
||||
virtual void *get_cpu_osl_memory() override;
|
||||
|
||||
/* Empty override of upload_changed(): no upload work is performed for this device. */
virtual void upload_changed() override {}
|
||||
protected:
|
||||
virtual bool load_kernels(uint /*kernel_features*/) override;
|
||||
};
|
||||
|
|
|
@ -645,6 +645,24 @@ void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
|
|||
load_texture_info();
|
||||
}
|
||||
|
||||
/* Add `mem` to this device's set of tracked buffers.
 *
 * The registration is logged first; the insertion into `device_buffers` is then done
 * while holding `device_buffer_mutex`. */
void Device::register_buffer(device_memory *mem)
{
  VLOG_INFO << "Register buffer " << mem->name;

  {
    /* The guard lives only for the duration of the insert. */
    thread_scoped_lock registry_guard(device_buffer_mutex);
    device_buffers.insert(mem);
  }
}
|
||||
|
||||
void Device::upload_changed() {
|
||||
for (const auto& buffer : device_buffers) {
|
||||
VLOG_INFO << "Checking " << buffer->name;
|
||||
if(buffer->modified) {
|
||||
VLOG_INFO << "Uploading to " << buffer->name;
|
||||
this->mem_copy_to(*buffer, buffer->device_size, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
|
||||
{
|
||||
void *device_pointer = 0;
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "util/types.h"
|
||||
#include "util/unique_ptr.h"
|
||||
#include "util/vector.h"
|
||||
#include "util/set.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -120,8 +121,9 @@ class DeviceInfo {
|
|||
|
||||
class Device {
|
||||
friend class device_sub_ptr;
|
||||
|
||||
protected:
|
||||
thread_mutex device_buffer_mutex;
|
||||
set<device_memory *> device_buffers;
|
||||
protected:
|
||||
Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
|
||||
: info(info_), stats(stats_), profiler(profiler_)
|
||||
{
|
||||
|
@ -293,6 +295,12 @@ class Device {
|
|||
|
||||
static void free_memory();
|
||||
|
||||
/*
|
||||
* Upload to the device any buffers that have changed
|
||||
*/
|
||||
virtual void upload_changed();
|
||||
|
||||
virtual void register_buffer(device_memory *);
|
||||
protected:
|
||||
/* Memory allocation, only accessed through device_memory. */
|
||||
friend class MultiDevice;
|
||||
|
@ -317,7 +325,7 @@ class Device {
|
|||
static vector<DeviceInfo> hip_devices;
|
||||
static vector<DeviceInfo> metal_devices;
|
||||
static vector<DeviceInfo> oneapi_devices;
|
||||
static uint devices_initialized_mask;
|
||||
static uint devices_initialized_mask;
|
||||
};
|
||||
|
||||
/* Device, which is GPU, with some common functionality for GPU back-ends. */
|
||||
|
@ -348,7 +356,6 @@ class GPUDevice : public Device {
|
|||
/* Returns true if the texture info was copied to the device (meaning, some more
|
||||
* re-initialization might be needed). */
|
||||
virtual bool load_texture_info();
|
||||
|
||||
protected:
|
||||
/* Memory allocation, only accessed through device_memory. */
|
||||
friend class device_memory;
|
||||
|
|
|
@ -36,6 +36,7 @@ class DummyDevice : public Device {
|
|||
virtual void mem_free(device_memory &) override {}
|
||||
|
||||
virtual void const_copy_to(const char *, void *, size_t) override {}
|
||||
/* Empty override of upload_changed(): calling it on this device does nothing. */
virtual void upload_changed() override {}
|
||||
};
|
||||
|
||||
Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
|
|
|
@ -45,7 +45,7 @@ void *device_memory::host_alloc(size_t size)
|
|||
}
|
||||
|
||||
void *ptr = device->host_mem_alloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
|
||||
|
||||
device->register_buffer(this);
|
||||
if (ptr) {
|
||||
util_guarded_mem_alloc(size);
|
||||
}
|
||||
|
|
|
@ -246,7 +246,8 @@ class device_memory {
|
|||
void *shared_pointer;
|
||||
/* reference counter for shared_pointer */
|
||||
int shared_counter;
|
||||
|
||||
bool modified;
|
||||
|
||||
virtual ~device_memory();
|
||||
|
||||
void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
|
||||
|
@ -296,7 +297,6 @@ class device_memory {
|
|||
size_t original_device_size;
|
||||
Device *original_device;
|
||||
bool need_realloc_;
|
||||
bool modified;
|
||||
bool shared_mem;
|
||||
};
|
||||
|
||||
|
|
|
@ -180,6 +180,8 @@ class MetalDevice : public Device {
|
|||
void tex_free(device_texture &mem);
|
||||
|
||||
void flush_delayed_free_list();
|
||||
|
||||
void upload_changed() {};
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
|
|
@ -31,6 +31,45 @@ class MultiDevice : public Device {
|
|||
int peer_island_index = -1;
|
||||
};
|
||||
|
||||
class device_memory_clone : public device_texture {
|
||||
public:
|
||||
device_memory_clone(const device_memory &mem, Device *sub_device, device_ptr sub_device_pointer)
|
||||
: device_texture(sub_device, mem.name, 0, IMAGE_DATA_TYPE_FLOAT,INTERPOLATION_NONE,EXTENSION_REPEAT) //mem.type)
|
||||
{
|
||||
data_type = mem.data_type;
|
||||
data_elements = mem.data_elements;
|
||||
data_size = mem.data_size;
|
||||
device_size = mem.device_size;
|
||||
data_width = mem.data_width;
|
||||
data_height = mem.data_height;
|
||||
data_depth = mem.data_depth;
|
||||
type = mem.type;
|
||||
name = mem.name;
|
||||
|
||||
/* Pointers. */
|
||||
device = sub_device;
|
||||
device_pointer = sub_device_pointer;
|
||||
|
||||
host_pointer = mem.host_pointer;
|
||||
shared_pointer = mem.shared_pointer;
|
||||
/* reference counter for shared_pointer */
|
||||
shared_counter = mem.shared_counter;
|
||||
modified = mem.modified;
|
||||
|
||||
if(type == MEM_TEXTURE) {
|
||||
const device_texture *p_tex = static_cast<const device_texture *>(&mem);
|
||||
memcpy(&info, &(p_tex->info), sizeof(TextureInfo));
|
||||
slot = p_tex->slot;
|
||||
}
|
||||
}
|
||||
|
||||
~device_memory_clone() {
|
||||
// Don't free anything
|
||||
host_pointer = 0;
|
||||
device_pointer = 0;
|
||||
}
|
||||
};
|
||||
|
||||
/* The sub-devices are kept in a vector rather than a list so that parallel_for can map an
 * integer id directly to a sub-device; the id can also be used to reach the real device
 * pointer more quickly. Because a vector reallocates its storage on resize, the
 * sub-devices are stored as pointers. */
|
||||
|
@ -317,26 +356,25 @@ class MultiDevice : public Device {
|
|||
|
||||
/* The tile buffers are allocated on each device (see below), so copy to all of them */
|
||||
foreach (const vector<SubDevice *> &island, peer_islands) {
|
||||
//parallel_for_each (peer_islands.begin(), peer_islands.end(), [&](const vector<SubDevice *> &island) {
|
||||
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
|
||||
mem.device = owner_sub->device.get();
|
||||
mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
|
||||
mem.device_size = existing_size;
|
||||
|
||||
owner_sub->device->mem_copy_to(mem, size, offset);
|
||||
owner_sub->ptr_map[key] = mem.device_pointer;
|
||||
Device *sub_device = owner_sub->device.get();
|
||||
device_ptr sub_device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
|
||||
device_memory_clone sub_mem(mem, sub_device, sub_device_pointer);
|
||||
owner_sub->device->mem_copy_to(sub_mem, size, offset);
|
||||
owner_sub->ptr_map[key] = sub_mem.device_pointer;
|
||||
|
||||
if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
|
||||
/* Need to create texture objects and update pointer in kernel globals on all devices */
|
||||
foreach (SubDevice *island_sub, island) {
|
||||
if (island_sub != owner_sub) {
|
||||
island_sub->device->mem_copy_to(mem, size, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Need to create texture objects and update pointer in kernel globals on all devices */
|
||||
foreach (SubDevice *island_sub, island) {
|
||||
if (island_sub != owner_sub) {
|
||||
island_sub->device->mem_copy_to(mem, size, offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}//);
|
||||
|
||||
mem.device = this;
|
||||
mem.device_pointer = key;
|
||||
stats.mem_alloc(mem.device_size - existing_size);
|
||||
}
|
||||
|
||||
|
@ -443,6 +481,41 @@ class MultiDevice : public Device {
|
|||
sub->device->foreach_device(callback);
|
||||
}
|
||||
}
|
||||
|
||||
virtual void upload_changed() override
|
||||
{
|
||||
//foreach (const vector<SubDevice *> &island, peer_islands) {
|
||||
parallel_for_each (peer_islands.begin(), peer_islands.end(), [&](const vector<SubDevice *> &island) {
|
||||
for (const device_memory *buffer: device_buffers) {
|
||||
VLOG_INFO << "Checking " << buffer->name << " on " << this;
|
||||
if (buffer->modified) {
|
||||
device_ptr existing_key = buffer->device_pointer;
|
||||
device_ptr key = (existing_key) ? existing_key : unique_key++;
|
||||
size_t existing_size = buffer->device_size;
|
||||
|
||||
SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
|
||||
Device *sub_device = owner_sub->device.get();
|
||||
device_ptr sub_device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
|
||||
device_memory_clone sub_mem(*buffer, sub_device, sub_device_pointer);
|
||||
|
||||
VLOG_INFO << "Uploading to " << buffer->name;
|
||||
owner_sub->device->mem_copy_to(sub_mem, existing_size, 0);
|
||||
owner_sub->ptr_map[key] = sub_mem.device_pointer;
|
||||
|
||||
if (sub_mem.type == MEM_GLOBAL || sub_mem.type == MEM_TEXTURE) {
|
||||
/* Need to create texture objects and update pointer in kernel globals on all devices */
|
||||
foreach (SubDevice *island_sub, island) {
|
||||
if (island_sub != owner_sub) {
|
||||
island_sub->device->mem_copy_to(sub_mem, existing_size, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
stats.mem_alloc(sub_mem.device_size - existing_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
|
|
|
@ -713,7 +713,7 @@ void GeometryManager::device_data_xfer_and_bvh_update(int idx,
|
|||
scene->times[idx].mesh = time;
|
||||
}
|
||||
});
|
||||
sub_dscene->device_update_mesh(sub_device, &(scene->geom_sizes), progress);
|
||||
//sub_dscene->device_update_mesh(sub_device, &(scene->geom_sizes), progress);
|
||||
}
|
||||
|
||||
{
|
||||
|
@ -722,7 +722,7 @@ void GeometryManager::device_data_xfer_and_bvh_update(int idx,
|
|||
scene->times[idx].attrib = time;
|
||||
}
|
||||
});
|
||||
sub_dscene->device_update_attributes(sub_device, &(scene->attrib_sizes), progress);
|
||||
//sub_dscene->device_update_attributes(sub_device, &(scene->attrib_sizes), progress);
|
||||
}
|
||||
|
||||
sub_dscene->device_scene_clear_modified();
|
||||
|
@ -909,10 +909,12 @@ void GeometryManager::device_update(Device *device,
|
|||
can_refit_scene_bvh = device_update_bvh_preprocess(device, dscene, scene, progress);
|
||||
}
|
||||
{
|
||||
//device->upload_changed();
|
||||
size_t num_scenes = scene->dscenes.size();
|
||||
VLOG_INFO << "Rendering using " << num_scenes << " devices";
|
||||
/* Parallel upload the geometry data to the devices and
|
||||
calculate or refit the BVHs */
|
||||
device->upload_changed();
|
||||
parallel_for(
|
||||
size_t(0), num_scenes, [=, &progress](const size_t idx) {
|
||||
device_data_xfer_and_bvh_update(idx,
|
||||
|
|
Loading…
Reference in New Issue