9 changed files with 128 additions and 24 deletions
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@ -88,6 +88,7 @@ class CPUDevice : public Device {
      vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
  virtual void *get_cpu_osl_memory() override;

+  /* No-op override: this device does nothing when asked to upload changed buffers.
+   * NOTE(review): presumably no explicit re-upload is needed for the CPU back-end;
+   * confirm this is intentional rather than a placeholder. */
+  virtual void upload_changed() override {}
+
 protected:
  virtual bool load_kernels(uint /*kernel_features*/) override;
 };
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@ -645,6 +645,24 @@ void GPUDevice::move_textures_to_host(size_t size, bool for_texture)
  load_texture_info();
 }

+/* Record `mem` in this device's set of known buffers.
+ *
+ * The registration is logged (with the buffer name) first; the insertion itself is
+ * serialized through `device_buffer_mutex`.
+ *
+ * NOTE(review): no matching removal is visible in this change; confirm that an entry
+ * cannot outlive the device_memory it points to. */
+void Device::register_buffer(device_memory *mem)
+{
+  VLOG_INFO << "Register buffer " << mem->name;
+
+  {
+    /* Hold the mutex only for the duration of the set update. */
+    thread_scoped_lock guard(device_buffer_mutex);
+    device_buffers.insert(mem);
+  }
+}
+
+/* Default implementation: re-upload every registered buffer whose `modified` flag is set.
+ *
+ * The set of registered buffers is copied while holding `device_buffer_mutex` (the same
+ * mutex register_buffer() takes) and the uploads run on that copy after the lock has been
+ * released. A concurrent registration therefore cannot invalidate the iteration, and a
+ * registration triggered from inside mem_copy_to() cannot deadlock on the mutex.
+ *
+ * Each modified buffer is copied in full: `device_size` bytes starting at offset 0.
+ *
+ * NOTE(review): this function does not reset `modified` itself; confirm that
+ * mem_copy_to() or the caller does. */
+void Device::upload_changed()
+{
+  vector<device_memory *> snapshot;
+  {
+    thread_scoped_lock lock(device_buffer_mutex);
+    snapshot.assign(device_buffers.begin(), device_buffers.end());
+  }
+
+  for (device_memory *buffer : snapshot) {
+    VLOG_INFO << "Checking " << buffer->name;
+    if (buffer->modified) {
+      VLOG_INFO << "Uploading to " << buffer->name;
+      mem_copy_to(*buffer, buffer->device_size, 0);
+    }
+  }
+}
+
 GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
 {
  void *device_pointer = 0;
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@ -21,6 +21,7 @@
 #include "util/types.h"
 #include "util/unique_ptr.h"
 #include "util/vector.h"
+#include "util/set.h"

 CCL_NAMESPACE_BEGIN

@ -120,8 +121,9 @@ class DeviceInfo {

 class Device {
  friend class device_sub_ptr;
-
- protected:
+  thread_mutex device_buffer_mutex;
+  set<device_memory *> device_buffers;
+protected:
  Device(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_)
      : info(info_), stats(stats_), profiler(profiler_)
  {
@ -293,6 +295,12 @@ class Device {

  static void free_memory();

+  /* Upload to the device any buffers that have changed.
+   * The default implementation walks the registered buffers and copies those whose
+   * `modified` flag is set; back-ends may override it. */
+  virtual void upload_changed();
+
+  /* Add a buffer to the set of buffers that the default upload_changed() inspects. */
+  virtual void register_buffer(device_memory *);
 protected:
  /* Memory allocation, only accessed through device_memory. */
  friend class MultiDevice;
@ -317,7 +325,7 @@ class Device {
  static vector<DeviceInfo> hip_devices;
  static vector<DeviceInfo> metal_devices;
  static vector<DeviceInfo> oneapi_devices;
-  static uint devices_initialized_mask;
+  static uint devices_initialized_mask;  
 };

 /* Device, which is GPU, with some common functionality for GPU back-ends. */
@ -348,7 +356,6 @@ class GPUDevice : public Device {
  /* Returns true if the texture info was copied to the device (meaning, some more
   * re-initialization might be needed). */
  virtual bool load_texture_info();
-
 protected:
  /* Memory allocation, only accessed through device_memory. */
  friend class device_memory;
--- a/intern/cycles/device/dummy/device.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@ -36,6 +36,7 @@ class DummyDevice : public Device {
  virtual void mem_free(device_memory &) override {}

  virtual void const_copy_to(const char *, void *, size_t) override {}
+  /* No-op override: nothing is uploaded when changed buffers are requested. */
+  virtual void upload_changed() override {}
 };

 Device *device_dummy_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
--- a/intern/cycles/device/memory.cpp
+++ b/intern/cycles/device/memory.cpp
@ -45,7 +45,7 @@ void *device_memory::host_alloc(size_t size)
  }

  void *ptr = device->host_mem_alloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
-
+  device->register_buffer(this);
  if (ptr) {
    util_guarded_mem_alloc(size);
  }
--- a/intern/cycles/device/memory.h
+++ b/intern/cycles/device/memory.h
@ -246,7 +246,8 @@ class device_memory {
  void *shared_pointer;
  /* reference counter for shared_pointer */
  int shared_counter;
-
+  bool modified;
+  
  virtual ~device_memory();

  void swap_device(Device *new_device, size_t new_device_size, device_ptr new_device_ptr);
@ -296,7 +297,6 @@ class device_memory {
  size_t original_device_size;
  Device *original_device;
  bool need_realloc_;
-  bool modified;
  bool shared_mem;
 };

--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@ -180,6 +180,8 @@ class MetalDevice : public Device {
  void tex_free(device_texture &mem);

  void flush_delayed_free_list();
+
+  /* No-op override of Device::upload_changed(): this back-end does nothing when asked
+   * to upload changed buffers.
+   * NOTE(review): confirm this is intentional rather than a placeholder. */
+  virtual void upload_changed() override {}
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@ -31,6 +31,45 @@ class MultiDevice : public Device {
    int peer_island_index = -1;
  };

+  /* Temporary stand-in for an existing device_memory, bound to a single sub-device.
+   *
+   * The constructor copies the source's layout fields, host/shared pointers and
+   * `modified` flag, but substitutes the given sub-device and sub-device pointer. When
+   * the source is a texture (`type == MEM_TEXTURE`) its `info` and `slot` are copied too.
+   *
+   * The destructor zeroes the host and device pointers so that nothing is freed through
+   * the clone. NOTE(review): this presumes the base-class destructors skip freeing when
+   * these pointers are zero; confirm against device_memory. */
+  class device_memory_clone : public device_texture {
+   public:
+    device_memory_clone(const device_memory &mem, Device *sub_device, device_ptr sub_device_pointer)
+        : device_texture(sub_device,
+                         mem.name,
+                         0,
+                         IMAGE_DATA_TYPE_FLOAT,
+                         INTERPOLATION_NONE,
+                         EXTENSION_REPEAT)
+    {
+      /* The texture arguments passed to the base constructor above are placeholders;
+       * every field that matters is overwritten from `mem` below. */
+
+      /* Binding to the sub-device. */
+      device = sub_device;
+      device_pointer = sub_device_pointer;
+
+      /* Identity and element layout. */
+      name = mem.name;
+      type = mem.type;
+      data_type = mem.data_type;
+      data_elements = mem.data_elements;
+
+      /* Sizes and dimensions. */
+      data_size = mem.data_size;
+      device_size = mem.device_size;
+      data_width = mem.data_width;
+      data_height = mem.data_height;
+      data_depth = mem.data_depth;
+
+      /* Host-side state, including the reference counter for shared_pointer. */
+      host_pointer = mem.host_pointer;
+      shared_pointer = mem.shared_pointer;
+      shared_counter = mem.shared_counter;
+      modified = mem.modified;
+
+      /* Texture-only state. */
+      if (mem.type == MEM_TEXTURE) {
+        const device_texture &src_tex = static_cast<const device_texture &>(mem);
+        memcpy(&info, &src_tex.info, sizeof(TextureInfo));
+        slot = src_tex.slot;
+      }
+    }
+
+    ~device_memory_clone()
+    {
+      /* Don't free anything: the clone does not own either pointer. */
+      device_pointer = 0;
+      host_pointer = 0;
+    }
+  };
+
  /* Switch from list to a vector to make the parallel_for easily map to the integer id.
     Also id now could be used to access the real device pointer more quickly. Also, since
     the vector reallocates the memory on resize the sub-devices are stored as pointers. */
@ -317,26 +356,25 @@ class MultiDevice : public Device {

    /* The tile buffers are allocated on each device (see below), so copy to all of them */
    foreach (const vector<SubDevice *> &island, peer_islands) {
+    //parallel_for_each (peer_islands.begin(), peer_islands.end(), [&](const vector<SubDevice *> &island) {
      SubDevice *owner_sub = find_suitable_mem_device(existing_key, island);
-      mem.device = owner_sub->device.get();
-      mem.device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
-      mem.device_size = existing_size;

-      owner_sub->device->mem_copy_to(mem, size, offset);
-      owner_sub->ptr_map[key] = mem.device_pointer;
+      Device *sub_device = owner_sub->device.get();
+      device_ptr sub_device_pointer = (existing_key) ? owner_sub->ptr_map[existing_key] : 0;
+      device_memory_clone sub_mem(mem, sub_device, sub_device_pointer);
+      owner_sub->device->mem_copy_to(sub_mem, size, offset);
+      owner_sub->ptr_map[key] = sub_mem.device_pointer;

      if (mem.type == MEM_GLOBAL || mem.type == MEM_TEXTURE) {
-        /* Need to create texture objects and update pointer in kernel globals on all devices */
-        foreach (SubDevice *island_sub, island) {
-          if (island_sub != owner_sub) {
-            island_sub->device->mem_copy_to(mem, size, offset);
-          }
-        }
-      }
-    }
+	/* Need to create texture objects and update pointer in kernel globals on all devices */
+	foreach (SubDevice *island_sub, island) {
+	  if (island_sub != owner_sub) {
+	    island_sub->device->mem_copy_to(mem, size, offset);
+	  }
+	}
+      } 
+    }//);

-    mem.device = this;
-    mem.device_pointer = key;
    stats.mem_alloc(mem.device_size - existing_size);
  }

@ -443,6 +481,41 @@ class MultiDevice : public Device {
      sub->device->foreach_device(callback);
    }
  }
+
+  /* Upload every registered buffer whose `modified` flag is set to the devices of each
+   * peer island.
+   *
+   * Runs in two phases:
+   *  1. Serially, under `device_buffer_mutex`, collect the modified buffers and settle
+   *     the key of each one: its current `device_pointer` when non-zero, otherwise a
+   *     fresh value taken from `unique_key`. Doing this before the parallel section keeps
+   *     the `unique_key` increment out of concurrently running lambdas and gives a buffer
+   *     the same key on every island.
+   *  2. In parallel over the peer islands, copy each pending buffer (in full, from offset
+   *     0) to the island's owning sub-device through a device_memory_clone and record the
+   *     resulting sub-device pointer in that sub-device's `ptr_map`. For MEM_GLOBAL and
+   *     MEM_TEXTURE buffers the copy is repeated on the other sub-devices of the island.
+   *
+   * NOTE(review): a freshly generated key is not written back to the buffer's
+   * `device_pointer` here, and `modified` is not reset; confirm who is responsible. */
+  virtual void upload_changed() override
+  {
+    /* A modified buffer together with the keys used to address its per-device pointer. */
+    struct PendingUpload {
+      const device_memory *buffer;
+      device_ptr existing_key;
+      device_ptr key;
+    };
+
+    vector<PendingUpload> pending;
+    {
+      thread_scoped_lock lock(device_buffer_mutex);
+      for (const device_memory *buffer : device_buffers) {
+        VLOG_INFO << "Checking " << buffer->name << " on " << this;
+        if (buffer->modified) {
+          const device_ptr existing_key = buffer->device_pointer;
+          const device_ptr key = (existing_key) ? existing_key : unique_key++;
+          pending.push_back({buffer, existing_key, key});
+        }
+      }
+    }
+
+    parallel_for_each(
+        peer_islands.begin(), peer_islands.end(), [&](const vector<SubDevice *> &island) {
+          for (const PendingUpload &upload : pending) {
+            const size_t existing_size = upload.buffer->device_size;
+
+            SubDevice *owner_sub = find_suitable_mem_device(upload.existing_key, island);
+            Device *sub_device = owner_sub->device.get();
+            const device_ptr sub_device_pointer =
+                (upload.existing_key) ? owner_sub->ptr_map[upload.existing_key] : 0;
+            device_memory_clone sub_mem(*upload.buffer, sub_device, sub_device_pointer);
+
+            VLOG_INFO << "Uploading to " << upload.buffer->name;
+            owner_sub->device->mem_copy_to(sub_mem, existing_size, 0);
+            owner_sub->ptr_map[upload.key] = sub_mem.device_pointer;
+
+            if (sub_mem.type == MEM_GLOBAL || sub_mem.type == MEM_TEXTURE) {
+              /* Need to create texture objects and update pointer in kernel globals on
+               * all devices. */
+              foreach (SubDevice *island_sub, island) {
+                if (island_sub != owner_sub) {
+                  island_sub->device->mem_copy_to(sub_mem, existing_size, 0);
+                }
+              }
+            }
+            stats.mem_alloc(sub_mem.device_size - existing_size);
+          }
+        });
+  }
 };

 Device *device_multi_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
--- a/intern/cycles/scene/geometry.cpp
+++ b/intern/cycles/scene/geometry.cpp
@ -713,7 +713,7 @@ void GeometryManager::device_data_xfer_and_bvh_update(int idx,
        scene->times[idx].mesh = time;
      }
    });
-    sub_dscene->device_update_mesh(sub_device, &(scene->geom_sizes), progress);
+    //sub_dscene->device_update_mesh(sub_device, &(scene->geom_sizes), progress);
  }

  {
@ -722,7 +722,7 @@ void GeometryManager::device_data_xfer_and_bvh_update(int idx,
        scene->times[idx].attrib = time;
      }
    });
-    sub_dscene->device_update_attributes(sub_device, &(scene->attrib_sizes), progress);
+    //sub_dscene->device_update_attributes(sub_device, &(scene->attrib_sizes), progress);
  }

  sub_dscene->device_scene_clear_modified();
@ -909,10 +909,12 @@ void GeometryManager::device_update(Device *device,
    can_refit_scene_bvh = device_update_bvh_preprocess(device, dscene, scene, progress);
  }
  {
+    //device->upload_changed();
    size_t num_scenes = scene->dscenes.size();
    VLOG_INFO << "Rendering using " << num_scenes << " devices";
    /* Parallel upload the geometry data to the devices and
       calculate or refit the BVHs */
+    device->upload_changed();
    parallel_for(
        size_t(0), num_scenes, [=, &progress](const size_t idx) {
          device_data_xfer_and_bvh_update(idx,