diff --git a/intern/cycles/device/cuda/device_impl.cpp b/intern/cycles/device/cuda/device_impl.cpp index f354ba6aee1..c19a0ade332 100644 --- a/intern/cycles/device/cuda/device_impl.cpp +++ b/intern/cycles/device/cuda/device_impl.cpp @@ -53,8 +53,12 @@ void CUDADevice::set_error(const string &error) } CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) + : GPUDevice(info, stats, profiler) { + /* Verify that base class types can be used with specific backend types */ + static_assert(sizeof(texMemObject) == sizeof(CUtexObject)); + static_assert(sizeof(arrayMemObject) == sizeof(CUarray)); + first_error = true; cuDevId = info.num; @@ -65,12 +69,6 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) need_texture_info = false; - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; pitch_alignment = 0; /* Initialize CUDA. */ @@ -91,8 +89,9 @@ CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) /* CU_CTX_MAP_HOST for mapping host memory when out of device memory. * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render, * so we can predict which memory to map to host. */ - cuda_assert( - cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + int value; + cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice)); + can_map_host = value != 0; cuda_assert(cuDeviceGetAttribute( &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice)); @@ -499,311 +498,57 @@ void CUDADevice::reserve_local_memory(const uint kernel_features) # endif } -void CUDADevice::init_host_memory() -{ - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. */ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; -} - -void CUDADevice::load_texture_info() -{ - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } -} - -void CUDADevice::move_textures_to_host(size_t size, bool for_texture) -{ - /* Break out of recursive call, which can happen when moving memory on a multi device. 
*/ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(cuda_mem_map_mutex); - foreach (CUDAMemMap::value_type &pair, cuda_mem_map) { - device_memory &mem = *pair.first; - CUDAMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple CUDA devices could be moving the memory. The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG_WORK << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. - * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; - - /* Update texture info array with new pointers. */ - load_texture_info(); -} - -CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding) +void CUDADevice::get_device_memory_info(size_t &total, size_t &free) { CUDAContextScope scope(this); - CUdeviceptr device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. 
*/ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; cuMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - cuMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = cuMemAlloc(&device_pointer, size); - if (mem_alloc_result == CUDA_SUCCESS) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = CUDA_SUCCESS; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. */ - mem_alloc_result = cuMemHostAlloc( - &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); - - assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) || - (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0)); - } - - if (mem_alloc_result == CUDA_SUCCESS) { - cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != CUDA_SUCCESS) { - if (mem.type == MEM_DEVICE_ONLY) { - status = " failed, out of device memory"; - set_error("System is out of GPU memory"); - } - else { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - } - - if (mem.name) { - VLOG_WORK << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - CUDAMem *cmem = &cuda_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. Only works if - * CUDA memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. 
Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; } -void CUDADevice::generic_copy_to(device_memory &mem) +bool CUDADevice::alloc_device(void *&device_pointer, size_t size) { - if (!mem.host_pointer || !mem.device_pointer) { - return; - } + CUDAContextScope scope(this); - /* If use_mapped_host of mem is false, the current device only uses device memory allocated by - * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from - * mem.host_pointer. */ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const CUDAContextScope scope(this); - cuda_assert( - cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size())); - } + CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size); + return mem_alloc_result == CUDA_SUCCESS; } -void CUDADevice::generic_free(device_memory &mem) +void CUDADevice::free_device(void *device_pointer) { - if (mem.device_pointer) { - CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end()); - const CUDAMem &cmem = cuda_mem_map[&mem]; + CUDAContextScope scope(this); - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. */ + cuda_assert(cuMemFree((CUdeviceptr)device_pointer)); +} - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - cuMemFreeHost(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - cuda_assert(cuMemFree(mem.device_pointer)); - } +bool CUDADevice::alloc_host(void *&shared_pointer, size_t size) +{ + CUDAContextScope scope(this); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; + CUresult mem_alloc_result = cuMemHostAlloc( + &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED); + return mem_alloc_result == CUDA_SUCCESS; +} - cuda_mem_map.erase(cuda_mem_map.find(&mem)); - } +void CUDADevice::free_host(void *shared_pointer) +{ + CUDAContextScope scope(this); + + cuMemFreeHost(shared_pointer); +} + +bool CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer) +{ + CUDAContextScope scope(this); + + cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0)); + return true; +} + +void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) +{ + const CUDAContextScope scope(this); + + cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size)); } void CUDADevice::mem_alloc(device_memory &mem) @@ -868,8 +613,8 @@ void CUDADevice::mem_zero(device_memory &mem) /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory * regardless of mem.host_pointer and mem.shared_pointer. 
*/ - thread_scoped_lock lock(cuda_mem_map_mutex); - if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + thread_scoped_lock lock(device_mem_map_mutex); + if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { const CUDAContextScope scope(this); cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size())); } @@ -994,19 +739,19 @@ void CUDADevice::tex_alloc(device_texture &mem) return; } - CUDAMem *cmem = NULL; + Mem *cmem = NULL; CUarray array_3d = NULL; size_t src_pitch = mem.data_width * dsize * mem.data_elements; size_t dst_pitch = src_pitch; if (!mem.is_resident(this)) { - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + cmem = &device_mem_map[&mem]; cmem->texobject = 0; if (mem.data_depth > 1) { array_3d = (CUarray)mem.device_pointer; - cmem->array = array_3d; + cmem->array = reinterpret_cast(array_3d); } else if (mem.data_height > 0) { dst_pitch = align_up(src_pitch, pitch_alignment); @@ -1050,10 +795,10 @@ void CUDADevice::tex_alloc(device_texture &mem) mem.device_size = size; stats.mem_alloc(size); - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + cmem = &device_mem_map[&mem]; cmem->texobject = 0; - cmem->array = array_3d; + cmem->array = reinterpret_cast(array_3d); } else if (mem.data_height > 0) { /* 2D texture, using pitch aligned linear memory. */ @@ -1137,8 +882,8 @@ void CUDADevice::tex_alloc(device_texture &mem) texDesc.filterMode = filter_mode; texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; - thread_scoped_lock lock(cuda_mem_map_mutex); - cmem = &cuda_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + cmem = &device_mem_map[&mem]; cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); @@ -1153,9 +898,9 @@ void CUDADevice::tex_free(device_texture &mem) { if (mem.device_pointer) { CUDAContextScope scope(this); - thread_scoped_lock lock(cuda_mem_map_mutex); - DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end()); - const CUDAMem &cmem = cuda_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + DCHECK(device_mem_map.find(&mem) != device_mem_map.end()); + const Mem &cmem = device_mem_map[&mem]; if (cmem.texobject) { /* Free bindless texture. */ @@ -1164,16 +909,16 @@ void CUDADevice::tex_free(device_texture &mem) if (!mem.is_resident(this)) { /* Do not free memory here, since it was allocated on a different device. */ - cuda_mem_map.erase(cuda_mem_map.find(&mem)); + device_mem_map.erase(device_mem_map.find(&mem)); } else if (cmem.array) { /* Free array. 
*/ - cuArrayDestroy(cmem.array); + cuArrayDestroy(reinterpret_cast(cmem.array)); stats.mem_free(mem.device_size); mem.device_pointer = 0; mem.device_size = 0; - cuda_mem_map.erase(cuda_mem_map.find(&mem)); + device_mem_map.erase(device_mem_map.find(&mem)); } else { lock.unlock(); diff --git a/intern/cycles/device/cuda/device_impl.h b/intern/cycles/device/cuda/device_impl.h index c18f2811161..c8cd9bbdac5 100644 --- a/intern/cycles/device/cuda/device_impl.h +++ b/intern/cycles/device/cuda/device_impl.h @@ -21,7 +21,7 @@ CCL_NAMESPACE_BEGIN class DeviceQueue; -class CUDADevice : public Device { +class CUDADevice : public GPUDevice { friend class CUDAContextScope; @@ -29,36 +29,11 @@ class CUDADevice : public Device { CUdevice cuDevice; CUcontext cuContext; CUmodule cuModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; int pitch_alignment; int cuDevId; int cuDevArchitecture; bool first_error; - struct CUDAMem { - CUDAMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - CUtexObject texobject; - CUarray array; - - /* If true, a mapped host memory in shared_pointer is being used. */ - bool use_mapped_host; - }; - typedef map CUDAMemMap; - CUDAMemMap cuda_mem_map; - thread_mutex cuda_mem_map_mutex; - - /* Bindless Textures */ - device_vector texture_info; - bool need_texture_info; - CUDADeviceKernels kernels; static bool have_precompiled_kernels(); @@ -88,17 +63,13 @@ class CUDADevice : public Device { void reserve_local_memory(const uint kernel_features); - void init_host_memory(); - - void load_texture_info(); - - void move_textures_to_host(size_t size, bool for_texture); - - CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); - - void generic_copy_to(device_memory &mem); - - void generic_free(device_memory &mem); + virtual void get_device_memory_info(size_t &total, size_t &free) override; + virtual bool alloc_device(void *&device_pointer, size_t size) override; + virtual void free_device(void *device_pointer) override; + virtual bool alloc_host(void *&shared_pointer, size_t size) override; + virtual void free_host(void *shared_pointer) override; + virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override; + virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override; void mem_alloc(device_memory &mem) override; diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp index ff7e46d48ab..ed06740021d 100644 --- a/intern/cycles/device/device.cpp +++ b/intern/cycles/device/device.cpp @@ -452,6 +452,320 @@ void *Device::get_cpu_osl_memory() return nullptr; } +GPUDevice::~GPUDevice() noexcept(false) +{ +} + +bool GPUDevice::load_texture_info() +{ + if (need_texture_info) { + /* Unset flag before copying, so this does not loop indefinitely if the copy below calls + * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ + need_texture_info = false; + texture_info.copy_to_device(); + return true; + } + else { + return false; + } +} + +void GPUDevice::init_host_memory(size_t preferred_texture_headroom, + size_t preferred_working_headroom) +{ + /* Limit amount of host mapped memory, because allocating too much can + * cause system instability. Leave at least half or 4 GB of system + * memory free, whichever is smaller. 
*/ + size_t default_limit = 4 * 1024 * 1024 * 1024LL; + size_t system_ram = system_physical_ram(); + + if (system_ram > 0) { + if (system_ram / 2 > default_limit) { + map_host_limit = system_ram - default_limit; + } + else { + map_host_limit = system_ram / 2; + } + } + else { + VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM"; + map_host_limit = 0; + } + + /* Amount of device memory to keep free after texture memory + * and working memory allocations respectively. We set the working + * memory limit headroom lower than the working one so there + * is space left for it. */ + device_working_headroom = preferred_working_headroom > 0 ? preferred_working_headroom : + 32 * 1024 * 1024LL; // 32MB + device_texture_headroom = preferred_texture_headroom > 0 ? preferred_texture_headroom : + 128 * 1024 * 1024LL; // 128MB + + VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) + << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; +} + +void GPUDevice::move_textures_to_host(size_t size, bool for_texture) +{ + /* Break out of recursive call, which can happen when moving memory on a multi device. */ + static bool any_device_moving_textures_to_host = false; + if (any_device_moving_textures_to_host) { + return; + } + + /* Signal to reallocate textures in host memory only. */ + move_texture_to_host = true; + + while (size > 0) { + /* Find suitable memory allocation to move. */ + device_memory *max_mem = NULL; + size_t max_size = 0; + bool max_is_image = false; + + thread_scoped_lock lock(device_mem_map_mutex); + foreach (MemMap::value_type &pair, device_mem_map) { + device_memory &mem = *pair.first; + Mem *cmem = &pair.second; + + /* Can only move textures allocated on this device (and not those from peer devices). + * And need to ignore memory that is already on the host. */ + if (!mem.is_resident(this) || cmem->use_mapped_host) { + continue; + } + + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && + (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + /* Can't move this type of memory. */ + if (!is_texture || cmem->array) { + continue; + } + + /* For other textures, only move image textures. */ + if (for_texture && !is_image) { + continue; + } + + /* Try to move largest allocation, prefer moving images. */ + if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { + max_is_image = is_image; + max_size = mem.device_size; + max_mem = &mem; + } + } + lock.unlock(); + + /* Move to host memory. This part is mutex protected since + * multiple backend devices could be moving the memory. The + * first one will do it, and the rest will adopt the pointer. */ + if (max_mem) { + VLOG_WORK << "Move memory from device to host: " << max_mem->name; + + static thread_mutex move_mutex; + thread_scoped_lock lock(move_mutex); + + any_device_moving_textures_to_host = true; + + /* Potentially need to call back into multi device, so pointer mapping + * and peer devices are updated. This is also necessary since the device + * pointer may just be a key here, so cannot be accessed and freed directly. + * Unfortunately it does mean that memory is reallocated on all other + * devices as well, which is potentially dangerous when still in use (since + * a thread rendering on another devices would only be caught in this mutex + * if it so happens to do an allocation at the same time as well. */ + max_mem->device_copy_to(); + size = (max_size >= size) ? 
0 : size - max_size; + + any_device_moving_textures_to_host = false; + } + else { + break; + } + } + + /* Unset flag before texture info is reloaded, since it should stay in device memory. */ + move_texture_to_host = false; + + /* Update texture info array with new pointers. */ + load_texture_info(); +} + +GPUDevice::Mem *GPUDevice::generic_alloc(device_memory &mem, size_t pitch_padding) +{ + void *device_pointer = 0; + size_t size = mem.memory_size() + pitch_padding; + + bool mem_alloc_result = false; + const char *status = ""; + + /* First try allocating in device memory, respecting headroom. We make + * an exception for texture info. It is small and frequently accessed, + * so treat it as working memory. + * + * If there is not enough room for working memory, we will try to move + * textures to host memory, assuming the performance impact would have + * been worse for working memory. */ + bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); + bool is_image = is_texture && (mem.data_height > 1); + + size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; + + size_t total = 0, free = 0; + get_device_memory_info(total, free); + + /* Move textures to host memory if needed. */ + if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { + move_textures_to_host(size + headroom - free, is_texture); + get_device_memory_info(total, free); + } + + /* Allocate in device memory. */ + if (!move_texture_to_host && (size + headroom) < free) { + mem_alloc_result = alloc_device(device_pointer, size); + if (mem_alloc_result) { + device_mem_in_use += size; + status = " in device memory"; + } + } + + /* Fall back to mapped host memory if needed and possible. */ + + void *shared_pointer = 0; + + if (!mem_alloc_result && can_map_host && mem.type != MEM_DEVICE_ONLY) { + if (mem.shared_pointer) { + /* Another device already allocated host memory. */ + mem_alloc_result = true; + shared_pointer = mem.shared_pointer; + } + else if (map_host_used + size < map_host_limit) { + /* Allocate host memory ourselves. */ + mem_alloc_result = alloc_host(shared_pointer, size); + + assert((mem_alloc_result && shared_pointer != 0) || + (!mem_alloc_result && shared_pointer == 0)); + } + + if (mem_alloc_result) { + assert(transform_host_pointer(device_pointer, shared_pointer)); + map_host_used += size; + status = " in host memory"; + } + } + + if (!mem_alloc_result) { + if (mem.type == MEM_DEVICE_ONLY) { + status = " failed, out of device memory"; + set_error("System is out of GPU memory"); + } + else { + status = " failed, out of device and host memory"; + set_error("System is out of GPU and shared host memory"); + } + } + + if (mem.name) { + VLOG_WORK << "Buffer allocate: " << mem.name << ", " + << string_human_readable_number(mem.memory_size()) << " bytes. (" + << string_human_readable_size(mem.memory_size()) << ")" << status; + } + + mem.device_pointer = (device_ptr)device_pointer; + mem.device_size = size; + stats.mem_alloc(size); + + if (!mem.device_pointer) { + return NULL; + } + + /* Insert into map of allocations. */ + thread_scoped_lock lock(device_mem_map_mutex); + Mem *cmem = &device_mem_map[&mem]; + if (shared_pointer != 0) { + /* Replace host pointer with our host allocation. Only works if + * memory layout is the same and has no pitch padding. Also + * does not work if we move textures to host during a render, + * since other devices might be using the memory. 
*/ + + if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && + mem.host_pointer != shared_pointer) { + memcpy(shared_pointer, mem.host_pointer, size); + + /* A Call to device_memory::host_free() should be preceded by + * a call to device_memory::device_free() for host memory + * allocated by a device to be handled properly. Two exceptions + * are here and a call in OptiXDevice::generic_alloc(), where + * the current host memory can be assumed to be allocated by + * device_memory::host_alloc(), not by a device */ + + mem.host_free(); + mem.host_pointer = shared_pointer; + } + mem.shared_pointer = shared_pointer; + mem.shared_counter++; + cmem->use_mapped_host = true; + } + else { + cmem->use_mapped_host = false; + } + + return cmem; +} + +void GPUDevice::generic_free(device_memory &mem) +{ + if (mem.device_pointer) { + thread_scoped_lock lock(device_mem_map_mutex); + DCHECK(device_mem_map.find(&mem) != device_mem_map.end()); + const Mem &cmem = device_mem_map[&mem]; + + /* If cmem.use_mapped_host is true, reference counting is used + * to safely free a mapped host memory. */ + + if (cmem.use_mapped_host) { + assert(mem.shared_pointer); + if (mem.shared_pointer) { + assert(mem.shared_counter > 0); + if (--mem.shared_counter == 0) { + if (mem.host_pointer == mem.shared_pointer) { + mem.host_pointer = 0; + } + free_host(mem.shared_pointer); + mem.shared_pointer = 0; + } + } + map_host_used -= mem.device_size; + } + else { + /* Free device memory. */ + free_device((void *)mem.device_pointer); + device_mem_in_use -= mem.device_size; + } + + stats.mem_free(mem.device_size); + mem.device_pointer = 0; + mem.device_size = 0; + + device_mem_map.erase(device_mem_map.find(&mem)); + } +} + +void GPUDevice::generic_copy_to(device_memory &mem) +{ + if (!mem.host_pointer || !mem.device_pointer) { + return; + } + + /* If use_mapped_host of mem is false, the current device only uses device memory allocated by + * backend device allocation regardless of mem.host_pointer and mem.shared_pointer, and should + * copy data from mem.host_pointer. */ + thread_scoped_lock lock(device_mem_map_mutex); + if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + copy_host_to_device((void *)mem.device_pointer, mem.host_pointer, mem.memory_size()); + } +} + /* DeviceInfo */ CCL_NAMESPACE_END diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h index 377c123b035..370ec8b0638 100644 --- a/intern/cycles/device/device.h +++ b/intern/cycles/device/device.h @@ -309,6 +309,93 @@ class Device { static uint devices_initialized_mask; }; +/* Device, which is GPU, with some common functionality for GPU backends */ +class GPUDevice : public Device { + protected: + GPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_) + : Device(info_, stats_, profiler_), + texture_info(this, "texture_info", MEM_GLOBAL), + need_texture_info(false), + can_map_host(false), + map_host_used(0), + map_host_limit(0), + device_texture_headroom(0), + device_working_headroom(0), + device_mem_map(), + device_mem_map_mutex(), + move_texture_to_host(false), + device_mem_in_use(0) + { + } + + public: + virtual ~GPUDevice() noexcept(false); + + /* For GPUs that can use bindless textures in some way or another. */ + device_vector texture_info; + bool need_texture_info; + /* Returns true if the texture info was copied to the device (meaning, some more + * re-initialization might be needed). 
*/ + virtual bool load_texture_info(); + + protected: + /* Memory allocation, only accessed through device_memory. */ + friend class device_memory; + + bool can_map_host; + size_t map_host_used; + size_t map_host_limit; + size_t device_texture_headroom; + size_t device_working_headroom; + typedef unsigned long long texMemObject; + typedef unsigned long long arrayMemObject; + struct Mem { + Mem() : texobject(0), array(0), use_mapped_host(false) + { + } + + texMemObject texobject; + arrayMemObject array; + + /* If true, a mapped host memory in shared_pointer is being used. */ + bool use_mapped_host; + }; + typedef map<device_memory *, Mem> MemMap; + MemMap device_mem_map; + thread_mutex device_mem_map_mutex; + bool move_texture_to_host; + /* Simple counter to track the amount of used device memory */ + size_t device_mem_in_use; + + virtual void init_host_memory(size_t preferred_texture_headroom = 0, + size_t preferred_working_headroom = 0); + virtual void move_textures_to_host(size_t size, bool for_texture); + + /* Allocation, deallocation and copy functions, with corresponding + * support of device/host allocations. */ + virtual GPUDevice::Mem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); + virtual void generic_free(device_memory &mem); + virtual void generic_copy_to(device_memory &mem); + + /* total - amount of device memory, free - amount of available device memory */ + virtual void get_device_memory_info(size_t &total, size_t &free) = 0; + + virtual bool alloc_device(void *&device_pointer, size_t size) = 0; + + virtual void free_device(void *device_pointer) = 0; + + virtual bool alloc_host(void *&shared_pointer, size_t size) = 0; + + virtual void free_host(void *shared_pointer) = 0; + + /* This function should return the device pointer corresponding to the shared pointer, + * which is a host buffer allocated in `alloc_host`. The function should return `true` + * if such an address transformation is possible and `false` otherwise. */ + virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0; + + virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) = 0; +}; + CCL_NAMESPACE_END #endif /* __DEVICE_H__ */ diff --git a/intern/cycles/device/hip/device_impl.cpp b/intern/cycles/device/hip/device_impl.cpp index 04de8619697..b9bcd7edcab 100644 --- a/intern/cycles/device/hip/device_impl.cpp +++ b/intern/cycles/device/hip/device_impl.cpp @@ -53,8 +53,12 @@ void HIPDevice::set_error(const string &error) } HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) - : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL) + : GPUDevice(info, stats, profiler) { + /* Verify that base class types can be used with specific backend types */ + static_assert(sizeof(texMemObject) == sizeof(hipTextureObject_t)); + static_assert(sizeof(arrayMemObject) == sizeof(hArray)); + first_error = true; hipDevId = info.num; @@ -65,12 +69,6 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) need_texture_info = false; - device_texture_headroom = 0; - device_working_headroom = 0; - move_texture_to_host = false; - map_host_limit = 0; - map_host_used = 0; - can_map_host = 0; pitch_alignment = 0; /* Initialize HIP. */ @@ -91,7 +89,9 @@ HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler) /* hipDeviceMapHost for mapping host memory when out of device memory. * hipDeviceLmemResizeToMax for reserving local memory ahead of render, * so we can predict which memory to map to host. 
*/ - hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice)); + int value; + hip_assert(hipDeviceGetAttribute(&value, hipDeviceAttributeCanMapHostMemory, hipDevice)); + can_map_host = value != 0; hip_assert( hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice)); @@ -460,305 +460,58 @@ void HIPDevice::reserve_local_memory(const uint kernel_features) # endif } -void HIPDevice::init_host_memory() -{ - /* Limit amount of host mapped memory, because allocating too much can - * cause system instability. Leave at least half or 4 GB of system - * memory free, whichever is smaller. */ - size_t default_limit = 4 * 1024 * 1024 * 1024LL; - size_t system_ram = system_physical_ram(); - - if (system_ram > 0) { - if (system_ram / 2 > default_limit) { - map_host_limit = system_ram - default_limit; - } - else { - map_host_limit = system_ram / 2; - } - } - else { - VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM"; - map_host_limit = 0; - } - - /* Amount of device memory to keep is free after texture memory - * and working memory allocations respectively. We set the working - * memory limit headroom lower so that some space is left after all - * texture memory allocations. */ - device_working_headroom = 32 * 1024 * 1024LL; // 32MB - device_texture_headroom = 128 * 1024 * 1024LL; // 128MB - - VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit) - << " bytes. (" << string_human_readable_size(map_host_limit) << ")"; -} - -void HIPDevice::load_texture_info() -{ - if (need_texture_info) { - /* Unset flag before copying, so this does not loop indefinitely if the copy below calls - * into 'move_textures_to_host' (which calls 'load_texture_info' again). */ - need_texture_info = false; - texture_info.copy_to_device(); - } -} - -void HIPDevice::move_textures_to_host(size_t size, bool for_texture) -{ - /* Break out of recursive call, which can happen when moving memory on a multi device. */ - static bool any_device_moving_textures_to_host = false; - if (any_device_moving_textures_to_host) { - return; - } - - /* Signal to reallocate textures in host memory only. */ - move_texture_to_host = true; - - while (size > 0) { - /* Find suitable memory allocation to move. */ - device_memory *max_mem = NULL; - size_t max_size = 0; - bool max_is_image = false; - - thread_scoped_lock lock(hip_mem_map_mutex); - foreach (HIPMemMap::value_type &pair, hip_mem_map) { - device_memory &mem = *pair.first; - HIPMem *cmem = &pair.second; - - /* Can only move textures allocated on this device (and not those from peer devices). - * And need to ignore memory that is already on the host. */ - if (!mem.is_resident(this) || cmem->use_mapped_host) { - continue; - } - - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && - (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - /* Can't move this type of memory. */ - if (!is_texture || cmem->array) { - continue; - } - - /* For other textures, only move image textures. */ - if (for_texture && !is_image) { - continue; - } - - /* Try to move largest allocation, prefer moving images. */ - if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) { - max_is_image = is_image; - max_size = mem.device_size; - max_mem = &mem; - } - } - lock.unlock(); - - /* Move to host memory. This part is mutex protected since - * multiple HIP devices could be moving the memory. 
The - * first one will do it, and the rest will adopt the pointer. */ - if (max_mem) { - VLOG_WORK << "Move memory from device to host: " << max_mem->name; - - static thread_mutex move_mutex; - thread_scoped_lock lock(move_mutex); - - any_device_moving_textures_to_host = true; - - /* Potentially need to call back into multi device, so pointer mapping - * and peer devices are updated. This is also necessary since the device - * pointer may just be a key here, so cannot be accessed and freed directly. - * Unfortunately it does mean that memory is reallocated on all other - * devices as well, which is potentially dangerous when still in use (since - * a thread rendering on another devices would only be caught in this mutex - * if it so happens to do an allocation at the same time as well. */ - max_mem->device_copy_to(); - size = (max_size >= size) ? 0 : size - max_size; - - any_device_moving_textures_to_host = false; - } - else { - break; - } - } - - /* Unset flag before texture info is reloaded, since it should stay in device memory. */ - move_texture_to_host = false; - - /* Update texture info array with new pointers. */ - load_texture_info(); -} - -HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding) +void HIPDevice::get_device_memory_info(size_t &total, size_t &free) { HIPContextScope scope(this); - hipDeviceptr_t device_pointer = 0; - size_t size = mem.memory_size() + pitch_padding; - - hipError_t mem_alloc_result = hipErrorOutOfMemory; - const char *status = ""; - - /* First try allocating in device memory, respecting headroom. We make - * an exception for texture info. It is small and frequently accessed, - * so treat it as working memory. - * - * If there is not enough room for working memory, we will try to move - * textures to host memory, assuming the performance impact would have - * been worse for working memory. */ - bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info); - bool is_image = is_texture && (mem.data_height > 1); - - size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom; - - size_t total = 0, free = 0; hipMemGetInfo(&free, &total); - - /* Move textures to host memory if needed. */ - if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) { - move_textures_to_host(size + headroom - free, is_texture); - hipMemGetInfo(&free, &total); - } - - /* Allocate in device memory. */ - if (!move_texture_to_host && (size + headroom) < free) { - mem_alloc_result = hipMalloc(&device_pointer, size); - if (mem_alloc_result == hipSuccess) { - status = " in device memory"; - } - } - - /* Fall back to mapped host memory if needed and possible. */ - - void *shared_pointer = 0; - - if (mem_alloc_result != hipSuccess && can_map_host) { - if (mem.shared_pointer) { - /* Another device already allocated host memory. */ - mem_alloc_result = hipSuccess; - shared_pointer = mem.shared_pointer; - } - else if (map_host_used + size < map_host_limit) { - /* Allocate host memory ourselves. 
*/ - mem_alloc_result = hipHostMalloc( - &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined); - - assert((mem_alloc_result == hipSuccess && shared_pointer != 0) || - (mem_alloc_result != hipSuccess && shared_pointer == 0)); - } - - if (mem_alloc_result == hipSuccess) { - hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0)); - map_host_used += size; - status = " in host memory"; - } - } - - if (mem_alloc_result != hipSuccess) { - status = " failed, out of device and host memory"; - set_error("System is out of GPU and shared host memory"); - } - - if (mem.name) { - VLOG_WORK << "Buffer allocate: " << mem.name << ", " - << string_human_readable_number(mem.memory_size()) << " bytes. (" - << string_human_readable_size(mem.memory_size()) << ")" << status; - } - - mem.device_pointer = (device_ptr)device_pointer; - mem.device_size = size; - stats.mem_alloc(size); - - if (!mem.device_pointer) { - return NULL; - } - - /* Insert into map of allocations. */ - thread_scoped_lock lock(hip_mem_map_mutex); - HIPMem *cmem = &hip_mem_map[&mem]; - if (shared_pointer != 0) { - /* Replace host pointer with our host allocation. Only works if - * HIP memory layout is the same and has no pitch padding. Also - * does not work if we move textures to host during a render, - * since other devices might be using the memory. */ - - if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer && - mem.host_pointer != shared_pointer) { - memcpy(shared_pointer, mem.host_pointer, size); - - /* A Call to device_memory::host_free() should be preceded by - * a call to device_memory::device_free() for host memory - * allocated by a device to be handled properly. Two exceptions - * are here and a call in OptiXDevice::generic_alloc(), where - * the current host memory can be assumed to be allocated by - * device_memory::host_alloc(), not by a device */ - - mem.host_free(); - mem.host_pointer = shared_pointer; - } - mem.shared_pointer = shared_pointer; - mem.shared_counter++; - cmem->use_mapped_host = true; - } - else { - cmem->use_mapped_host = false; - } - - return cmem; } -void HIPDevice::generic_copy_to(device_memory &mem) +bool HIPDevice::alloc_device(void *&device_pointer, size_t size) { - if (!mem.host_pointer || !mem.device_pointer) { - return; - } + HIPContextScope scope(this); - /* If use_mapped_host of mem is false, the current device only uses device memory allocated by - * hipMalloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from - * mem.host_pointer. */ - thread_scoped_lock lock(hip_mem_map_mutex); - if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { - const HIPContextScope scope(this); - hip_assert( - hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size())); - } + hipError_t mem_alloc_result = hipMalloc((hipDeviceptr_t *)&device_pointer, size); + return mem_alloc_result == hipSuccess; } -void HIPDevice::generic_free(device_memory &mem) +void HIPDevice::free_device(void *device_pointer) { - if (mem.device_pointer) { - HIPContextScope scope(this); - thread_scoped_lock lock(hip_mem_map_mutex); - DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end()); - const HIPMem &cmem = hip_mem_map[&mem]; + HIPContextScope scope(this); - /* If cmem.use_mapped_host is true, reference counting is used - * to safely free a mapped host memory. 
*/ + hip_assert(hipFree((hipDeviceptr_t)device_pointer)); +} - if (cmem.use_mapped_host) { - assert(mem.shared_pointer); - if (mem.shared_pointer) { - assert(mem.shared_counter > 0); - if (--mem.shared_counter == 0) { - if (mem.host_pointer == mem.shared_pointer) { - mem.host_pointer = 0; - } - hipHostFree(mem.shared_pointer); - mem.shared_pointer = 0; - } - } - map_host_used -= mem.device_size; - } - else { - /* Free device memory. */ - hip_assert(hipFree(mem.device_pointer)); - } +bool HIPDevice::alloc_host(void *&shared_pointer, size_t size) +{ + HIPContextScope scope(this); - stats.mem_free(mem.device_size); - mem.device_pointer = 0; - mem.device_size = 0; + hipError_t mem_alloc_result = hipHostMalloc( + &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined); - hip_mem_map.erase(hip_mem_map.find(&mem)); - } + return mem_alloc_result == hipSuccess; +} + +void HIPDevice::free_host(void *shared_pointer) +{ + HIPContextScope scope(this); + + hipHostFree(shared_pointer); +} + +bool HIPDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer) +{ + HIPContextScope scope(this); + + hip_assert(hipHostGetDevicePointer((hipDeviceptr_t *)&device_pointer, shared_pointer, 0)); + return true; +} + +void HIPDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) +{ + const HIPContextScope scope(this); + + hip_assert(hipMemcpyHtoD((hipDeviceptr_t)device_pointer, host_pointer, size)); } void HIPDevice::mem_alloc(device_memory &mem) @@ -823,8 +576,8 @@ void HIPDevice::mem_zero(device_memory &mem) /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory * regardless of mem.host_pointer and mem.shared_pointer. */ - thread_scoped_lock lock(hip_mem_map_mutex); - if (!hip_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { + thread_scoped_lock lock(device_mem_map_mutex); + if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) { const HIPContextScope scope(this); hip_assert(hipMemsetD8((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size())); } @@ -951,19 +704,19 @@ void HIPDevice::tex_alloc(device_texture &mem) return; } - HIPMem *cmem = NULL; + Mem *cmem = NULL; hArray array_3d = NULL; size_t src_pitch = mem.data_width * dsize * mem.data_elements; size_t dst_pitch = src_pitch; if (!mem.is_resident(this)) { - thread_scoped_lock lock(hip_mem_map_mutex); - cmem = &hip_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + cmem = &device_mem_map[&mem]; cmem->texobject = 0; if (mem.data_depth > 1) { array_3d = (hArray)mem.device_pointer; - cmem->array = array_3d; + cmem->array = reinterpret_cast(array_3d); } else if (mem.data_height > 0) { dst_pitch = align_up(src_pitch, pitch_alignment); @@ -1007,10 +760,10 @@ void HIPDevice::tex_alloc(device_texture &mem) mem.device_size = size; stats.mem_alloc(size); - thread_scoped_lock lock(hip_mem_map_mutex); - cmem = &hip_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + cmem = &device_mem_map[&mem]; cmem->texobject = 0; - cmem->array = array_3d; + cmem->array = reinterpret_cast(array_3d); } else if (mem.data_height > 0) { /* 2D texture, using pitch aligned linear memory. 
*/ @@ -1095,8 +848,8 @@ void HIPDevice::tex_alloc(device_texture &mem) texDesc.filterMode = filter_mode; texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES; - thread_scoped_lock lock(hip_mem_map_mutex); - cmem = &hip_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + cmem = &device_mem_map[&mem]; hip_assert(hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL)); @@ -1111,9 +864,9 @@ void HIPDevice::tex_free(device_texture &mem) { if (mem.device_pointer) { HIPContextScope scope(this); - thread_scoped_lock lock(hip_mem_map_mutex); - DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end()); - const HIPMem &cmem = hip_mem_map[&mem]; + thread_scoped_lock lock(device_mem_map_mutex); + DCHECK(device_mem_map.find(&mem) != device_mem_map.end()); + const Mem &cmem = device_mem_map[&mem]; if (cmem.texobject) { /* Free bindless texture. */ @@ -1122,16 +875,16 @@ void HIPDevice::tex_free(device_texture &mem) if (!mem.is_resident(this)) { /* Do not free memory here, since it was allocated on a different device. */ - hip_mem_map.erase(hip_mem_map.find(&mem)); + device_mem_map.erase(device_mem_map.find(&mem)); } else if (cmem.array) { /* Free array. */ - hipArrayDestroy(cmem.array); + hipArrayDestroy(reinterpret_cast(cmem.array)); stats.mem_free(mem.device_size); mem.device_pointer = 0; mem.device_size = 0; - hip_mem_map.erase(hip_mem_map.find(&mem)); + device_mem_map.erase(device_mem_map.find(&mem)); } else { lock.unlock(); diff --git a/intern/cycles/device/hip/device_impl.h b/intern/cycles/device/hip/device_impl.h index efdc15dca79..6039827e58e 100644 --- a/intern/cycles/device/hip/device_impl.h +++ b/intern/cycles/device/hip/device_impl.h @@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN class DeviceQueue; -class HIPDevice : public Device { +class HIPDevice : public GPUDevice { friend class HIPContextScope; @@ -26,36 +26,11 @@ class HIPDevice : public Device { hipDevice_t hipDevice; hipCtx_t hipContext; hipModule_t hipModule; - size_t device_texture_headroom; - size_t device_working_headroom; - bool move_texture_to_host; - size_t map_host_used; - size_t map_host_limit; - int can_map_host; int pitch_alignment; int hipDevId; int hipDevArchitecture; bool first_error; - struct HIPMem { - HIPMem() : texobject(0), array(0), use_mapped_host(false) - { - } - - hipTextureObject_t texobject; - hArray array; - - /* If true, a mapped host memory in shared_pointer is being used. 
*/ - bool use_mapped_host; - }; - typedef map HIPMemMap; - HIPMemMap hip_mem_map; - thread_mutex hip_mem_map_mutex; - - /* Bindless Textures */ - device_vector texture_info; - bool need_texture_info; - HIPDeviceKernels kernels; static bool have_precompiled_kernels(); @@ -81,17 +56,13 @@ class HIPDevice : public Device { virtual bool load_kernels(const uint kernel_features) override; void reserve_local_memory(const uint kernel_features); - void init_host_memory(); - - void load_texture_info(); - - void move_textures_to_host(size_t size, bool for_texture); - - HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0); - - void generic_copy_to(device_memory &mem); - - void generic_free(device_memory &mem); + virtual void get_device_memory_info(size_t &total, size_t &free) override; + virtual bool alloc_device(void *&device_pointer, size_t size) override; + virtual void free_device(void *device_pointer) override; + virtual bool alloc_host(void *&shared_pointer, size_t size) override; + virtual void free_host(void *shared_pointer) override; + virtual bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override; + virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override; void mem_alloc(device_memory &mem) override; diff --git a/intern/cycles/device/memory.h b/intern/cycles/device/memory.h index 5f44475077e..859f1ead14b 100644 --- a/intern/cycles/device/memory.h +++ b/intern/cycles/device/memory.h @@ -247,6 +247,8 @@ class device_memory { bool is_resident(Device *sub_device) const; protected: + friend class Device; + friend class GPUDevice; friend class CUDADevice; friend class OptiXDevice; friend class HIPDevice;
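Note: the seven pure-virtual hooks added to `GPUDevice` in `device.h` above are the entire surface a backend has to provide; headroom handling, moving textures to host, the mapped-host fallback and shared-pointer reference counting now live in the shared `generic_alloc()`/`generic_copy_to()`/`generic_free()` code in `device.cpp`. The sketch below is illustrative only and is not part of this patch: `ToyGPUDevice`, the `Toy*` handle types and the `toy*()` driver calls are hypothetical placeholders standing in for a real driver API, in the same way the CUDA and HIP hunks above wrap `cuMemAlloc`/`hipMalloc` and friends.

```cpp
/* Illustrative sketch only -- not part of this patch. The toy*() driver
 * functions and handle types are hypothetical stand-ins for a real backend
 * API (compare with the CUDA/HIP implementations above). */

typedef unsigned long long ToyTexHandle; /* cf. CUtexObject / hipTextureObject_t */
typedef void *ToyArrayHandle;            /* cf. CUarray / hArray */

enum ToyResult { TOY_SUCCESS, TOY_ERROR_OUT_OF_MEMORY };
ToyResult toyMemGetInfo(size_t *free, size_t *total);
ToyResult toyMalloc(void **device_pointer, size_t size);
void toyFree(void *device_pointer);
ToyResult toyHostMalloc(void **host_pointer, size_t size); /* pinned, device-mappable */
void toyHostFree(void *host_pointer);
ToyResult toyHostGetDevicePointer(void **device_pointer, void *host_pointer);
void toyMemcpyHtoD(void *device_pointer, void *host_pointer, size_t size);

class ToyGPUDevice : public GPUDevice {
 public:
  ToyGPUDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
      : GPUDevice(info, stats, profiler)
  {
    /* Same pattern as the CUDA/HIP constructors: the base class stores texture
     * and array handles as 64-bit integers, so verify the backend types fit. */
    static_assert(sizeof(texMemObject) == sizeof(ToyTexHandle));
    static_assert(sizeof(arrayMemObject) == sizeof(ToyArrayHandle));

    can_map_host = true; /* A real backend would query the driver for this. */
    init_host_memory();  /* Sets map_host_limit and the default headrooms. */
  }

  /* Used by generic_alloc() to respect the texture/working headrooms. */
  void get_device_memory_info(size_t &total, size_t &free) override
  {
    toyMemGetInfo(&free, &total);
  }

  bool alloc_device(void *&device_pointer, size_t size) override
  {
    return toyMalloc(&device_pointer, size) == TOY_SUCCESS;
  }

  void free_device(void *device_pointer) override
  {
    toyFree(device_pointer);
  }

  /* Pinned host allocation used when falling back to mapped host memory. */
  bool alloc_host(void *&shared_pointer, size_t size) override
  {
    return toyHostMalloc(&shared_pointer, size) == TOY_SUCCESS;
  }

  void free_host(void *shared_pointer) override
  {
    toyHostFree(shared_pointer);
  }

  /* Return the device-visible address of a host buffer from alloc_host(),
   * or false if the backend cannot provide one. */
  bool transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    return toyHostGetDevicePointer(&device_pointer, shared_pointer) == TOY_SUCCESS;
  }

  void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override
  {
    toyMemcpyHtoD(device_pointer, host_pointer, size);
  }
};
```

With just these hooks implemented, the shared `GPUDevice::generic_alloc()`, `generic_copy_to()`, `generic_free()` and `move_textures_to_host()` paths take care of the rest; a real backend would of course still implement the remaining `Device` virtuals (`mem_alloc()`, queues, kernel loading, etc.) the way `CUDADevice` and `HIPDevice` do above.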