|
|
|
@ -11,6 +11,7 @@
|
|
|
|
|
# include "device/oneapi/device_impl.h"
|
|
|
|
|
|
|
|
|
|
# include "util/debug.h"
|
|
|
|
|
# include "util/foreach.h"
|
|
|
|
|
# include "util/log.h"
|
|
|
|
|
|
|
|
|
|
# ifdef WITH_EMBREE_GPU
|
|
|
|
@ -47,18 +48,20 @@ static void queue_error_cb(const char *message, void *user_ptr)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
|
|
|
|
: Device(info, stats, profiler),
|
|
|
|
|
: GPUDevice(info, stats, profiler),
|
|
|
|
|
device_queue_(nullptr),
|
|
|
|
|
# ifdef WITH_EMBREE_GPU
|
|
|
|
|
embree_device(nullptr),
|
|
|
|
|
embree_scene(nullptr),
|
|
|
|
|
# endif
|
|
|
|
|
texture_info_(this, "texture_info", MEM_GLOBAL),
|
|
|
|
|
kg_memory_(nullptr),
|
|
|
|
|
kg_memory_device_(nullptr),
|
|
|
|
|
kg_memory_size_(0)
|
|
|
|
|
{
|
|
|
|
|
need_texture_info_ = false;
|
|
|
|
|
/* Verify that base class types can be used with specific backend types */
|
|
|
|
|
static_assert(sizeof(texMemObject) == sizeof(void *));
|
|
|
|
|
static_assert(sizeof(arrayMemObject) == sizeof(void *));
|
|
|
|
|
|
|
|
|
|
use_hardware_raytracing = info.use_hardware_raytracing;
|
|
|
|
|
|
|
|
|
|
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
|
|
|
|
@ -110,6 +113,18 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
|
|
|
|
|
kg_memory_size_ = globals_segment_size;
|
|
|
|
|
|
|
|
|
|
max_memory_on_device_ = get_memcapacity();
|
|
|
|
|
init_host_memory();
|
|
|
|
|
move_texture_to_host = false;
|
|
|
|
|
can_map_host = true;
|
|
|
|
|
|
|
|
|
|
const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
|
|
|
|
|
if (headroom_str != nullptr) {
|
|
|
|
|
const long long override_headroom = (float)atoll(headroom_str);
|
|
|
|
|
device_working_headroom = override_headroom;
|
|
|
|
|
device_texture_headroom = override_headroom;
|
|
|
|
|
}
|
|
|
|
|
VLOG_DEBUG << "oneAPI memory headroom size: "
|
|
|
|
|
<< string_human_readable_size(device_working_headroom);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
OneapiDevice::~OneapiDevice()
|
|
|
|
@ -119,7 +134,7 @@ OneapiDevice::~OneapiDevice()
|
|
|
|
|
rtcReleaseDevice(embree_device);
|
|
|
|
|
# endif
|
|
|
|
|
|
|
|
|
|
texture_info_.free();
|
|
|
|
|
texture_info.free();
|
|
|
|
|
usm_free(device_queue_, kg_memory_);
|
|
|
|
|
usm_free(device_queue_, kg_memory_device_);
|
|
|
|
|
|
|
|
|
@ -166,8 +181,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
|
|
|
|
else {
|
|
|
|
|
bvh_embree->build(progress, &stats, embree_device, true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
# if RTC_VERSION >= 40302
|
|
|
|
|
thread_scoped_lock lock(scene_data_mutex);
|
|
|
|
|
all_embree_scenes.push_back(bvh_embree->scene);
|
|
|
|
|
# endif
|
|
|
|
|
|
|
|
|
|
if (bvh->params.top_level) {
|
|
|
|
|
embree_scene = bvh_embree->scene;
|
|
|
|
|
# if RTC_VERSION >= 40302
|
|
|
|
|
if (bvh_embree->offload_scenes_to_gpu(all_embree_scenes) == false) {
|
|
|
|
|
set_error(
|
|
|
|
|
string_printf("BVH failed to to migrate to the GPU due to Embree library error (%s)",
|
|
|
|
|
bvh_embree->get_last_error_message()));
|
|
|
|
|
}
|
|
|
|
|
all_embree_scenes.clear();
|
|
|
|
|
# endif
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
@ -176,6 +205,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
|
|
|
|
}
|
|
|
|
|
# endif
|
|
|
|
|
|
|
|
|
|
size_t OneapiDevice::get_free_mem() const
|
|
|
|
|
{
|
|
|
|
|
/* Accurate: Use device info. */
|
|
|
|
|
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
|
|
|
|
|
if (device.has(sycl::aspect::ext_intel_free_memory)) {
|
|
|
|
|
return device.get_info<sycl::ext::intel::info::device::free_memory>();
|
|
|
|
|
}
|
|
|
|
|
/* Estimate: Capacity - in use. */
|
|
|
|
|
else if (device_mem_in_use < max_memory_on_device_) {
|
|
|
|
|
return max_memory_on_device_ - device_mem_in_use;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool OneapiDevice::load_kernels(const uint requested_features)
|
|
|
|
|
{
|
|
|
|
|
assert(device_queue_);
|
|
|
|
@ -208,63 +253,101 @@ bool OneapiDevice::load_kernels(const uint requested_features)
|
|
|
|
|
VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (is_finished_ok) {
|
|
|
|
|
reserve_private_memory(requested_features);
|
|
|
|
|
is_finished_ok = !have_error();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return is_finished_ok;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::load_texture_info()
|
|
|
|
|
void OneapiDevice::reserve_private_memory(const uint kernel_features)
|
|
|
|
|
{
|
|
|
|
|
if (need_texture_info_) {
|
|
|
|
|
need_texture_info_ = false;
|
|
|
|
|
texture_info_.copy_to_device();
|
|
|
|
|
size_t free_before = get_free_mem();
|
|
|
|
|
|
|
|
|
|
/* Use the biggest kernel for estimation. */
|
|
|
|
|
const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
|
|
|
|
|
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
|
|
|
|
|
(kernel_features & KERNEL_FEATURE_MNEE) ?
|
|
|
|
|
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
|
|
|
|
|
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
unique_ptr<DeviceQueue> queue = gpu_queue_create();
|
|
|
|
|
|
|
|
|
|
device_ptr d_path_index = 0;
|
|
|
|
|
device_ptr d_render_buffer = 0;
|
|
|
|
|
int d_work_size = 0;
|
|
|
|
|
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
|
|
|
|
|
|
|
|
|
|
queue->init_execution();
|
|
|
|
|
/* Launch of the kernel seems to be sufficient to reserve all
|
|
|
|
|
* needed memory regardless of the execution global size.
|
|
|
|
|
* So, the smallest possible size is used here. */
|
|
|
|
|
queue->enqueue(test_kernel, 1, args);
|
|
|
|
|
queue->synchronize();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t free_after = get_free_mem();
|
|
|
|
|
|
|
|
|
|
VLOG_INFO << "For kernel execution were reserved "
|
|
|
|
|
<< string_human_readable_number(free_before - free_after) << " bytes. ("
|
|
|
|
|
<< string_human_readable_size(free_before - free_after) << ")";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::generic_alloc(device_memory &mem)
|
|
|
|
|
void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
|
|
|
|
|
{
|
|
|
|
|
size_t memory_size = mem.memory_size();
|
|
|
|
|
|
|
|
|
|
/* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
|
|
|
|
|
* we can use USM host memory.
|
|
|
|
|
* Because of the expected performance impact, implementation of this has had a low priority
|
|
|
|
|
* and is not implemented yet. */
|
|
|
|
|
|
|
|
|
|
assert(device_queue_);
|
|
|
|
|
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
|
|
|
|
|
* and shared. For new project it maybe more beneficial to use USM shared memory, because it
|
|
|
|
|
* provides automatic migration mechanism in order to allow to use the same pointer on host and
|
|
|
|
|
* on device, without need to worry about explicit memory transfer operations. But for
|
|
|
|
|
* Blender/Cycles this type of memory is not very suitable in current application architecture,
|
|
|
|
|
* because Cycles already uses two different pointer for host activity and device activity, and
|
|
|
|
|
* also has to perform all needed memory transfer operations. So, USM device memory
|
|
|
|
|
* type has been used for oneAPI device in order to better fit in Cycles architecture. */
|
|
|
|
|
void *device_pointer = nullptr;
|
|
|
|
|
if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
|
|
|
|
|
device_pointer = usm_alloc_device(device_queue_, memory_size);
|
|
|
|
|
if (device_pointer == nullptr) {
|
|
|
|
|
set_error("oneAPI kernel - device memory allocation error for " +
|
|
|
|
|
string_human_readable_size(mem.memory_size()) +
|
|
|
|
|
", possibly caused by lack of available memory space on the device: " +
|
|
|
|
|
string_human_readable_size(stats.mem_used) + " of " +
|
|
|
|
|
string_human_readable_size(max_memory_on_device_) + " is already allocated");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
|
|
|
|
|
mem.device_size = memory_size;
|
|
|
|
|
|
|
|
|
|
stats.mem_alloc(memory_size);
|
|
|
|
|
free = get_free_mem();
|
|
|
|
|
total = max_memory_on_device_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::generic_copy_to(device_memory &mem)
|
|
|
|
|
bool OneapiDevice::alloc_device(void *&device_pointer, size_t size)
|
|
|
|
|
{
|
|
|
|
|
if (!mem.device_pointer) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
size_t memory_size = mem.memory_size();
|
|
|
|
|
bool allocation_success = false;
|
|
|
|
|
device_pointer = usm_alloc_device(device_queue_, size);
|
|
|
|
|
if (device_pointer != nullptr) {
|
|
|
|
|
allocation_success = true;
|
|
|
|
|
/* Due to lazy memory initialisation in GPU runtime we will force memory to
|
|
|
|
|
* appear in device memory via execution of a kernel using this memory.. */
|
|
|
|
|
if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
|
|
|
|
|
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
|
|
|
|
|
"\"");
|
|
|
|
|
usm_free(device_queue_, device_pointer);
|
|
|
|
|
|
|
|
|
|
/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
|
|
|
|
|
assert(mem.host_pointer);
|
|
|
|
|
assert(device_queue_);
|
|
|
|
|
usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
|
|
|
|
|
device_pointer = nullptr;
|
|
|
|
|
allocation_success = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return allocation_success;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::free_device(void *device_pointer)
|
|
|
|
|
{
|
|
|
|
|
usm_free(device_queue_, device_pointer);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size)
|
|
|
|
|
{
|
|
|
|
|
shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
|
|
|
|
|
return shared_pointer != nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::free_host(void *shared_pointer)
|
|
|
|
|
{
|
|
|
|
|
usm_free(device_queue_, shared_pointer);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
|
|
|
|
|
{
|
|
|
|
|
/* Device and host pointer are in the same address space
|
|
|
|
|
* as we're using Unified Shared Memory. */
|
|
|
|
|
device_pointer = shared_pointer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
|
|
|
|
|
{
|
|
|
|
|
usm_memcpy(device_queue_, device_pointer, host_pointer, size);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
|
|
|
|
@ -288,20 +371,6 @@ void *OneapiDevice::kernel_globals_device_pointer()
|
|
|
|
|
return kg_memory_device_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::generic_free(device_memory &mem)
|
|
|
|
|
{
|
|
|
|
|
if (!mem.device_pointer) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
stats.mem_free(mem.device_size);
|
|
|
|
|
mem.device_size = 0;
|
|
|
|
|
|
|
|
|
|
assert(device_queue_);
|
|
|
|
|
usm_free(device_queue_, (void *)mem.device_pointer);
|
|
|
|
|
mem.device_pointer = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::mem_alloc(device_memory &mem)
|
|
|
|
|
{
|
|
|
|
|
if (mem.type == MEM_TEXTURE) {
|
|
|
|
@ -344,7 +413,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
if (!mem.device_pointer)
|
|
|
|
|
mem_alloc(mem);
|
|
|
|
|
generic_alloc(mem);
|
|
|
|
|
|
|
|
|
|
generic_copy_to(mem);
|
|
|
|
|
}
|
|
|
|
@ -515,14 +584,14 @@ void OneapiDevice::tex_alloc(device_texture &mem)
|
|
|
|
|
|
|
|
|
|
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
|
|
|
|
|
const uint slot = mem.slot;
|
|
|
|
|
if (slot >= texture_info_.size()) {
|
|
|
|
|
texture_info_.resize(slot + 128);
|
|
|
|
|
if (slot >= texture_info.size()) {
|
|
|
|
|
texture_info.resize(slot + 128);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
texture_info_[slot] = mem.info;
|
|
|
|
|
need_texture_info_ = true;
|
|
|
|
|
texture_info[slot] = mem.info;
|
|
|
|
|
need_texture_info = true;
|
|
|
|
|
|
|
|
|
|
texture_info_[slot].data = (uint64_t)mem.device_pointer;
|
|
|
|
|
texture_info[slot].data = (uint64_t)mem.device_pointer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void OneapiDevice::tex_free(device_texture &mem)
|
|
|
|
@ -628,6 +697,16 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
|
|
|
|
|
{
|
|
|
|
|
assert(queue_);
|
|
|
|
|
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
|
|
|
|
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
|
|
|
|
|
* and shared. For new project it could more beneficial to use USM shared memory, because it
|
|
|
|
|
* provides automatic migration mechanism in order to allow to use the same pointer on host and
|
|
|
|
|
* on device, without need to worry about explicit memory transfer operations, although usage of
|
|
|
|
|
* USM shared imply some documented limitations on the memory usage in regards of parallel access
|
|
|
|
|
* from differen threads. But for Blender/Cycles this type of memory is not very suitable in
|
|
|
|
|
* current application architecture, because Cycles is multithread application and already uses
|
|
|
|
|
* two different pointer for host activity and device activity, and also has to perform all
|
|
|
|
|
* needed memory transfer operations. So, USM device memory type has been used for oneAPI device
|
|
|
|
|
* in order to better fit in Cycles architecture. */
|
|
|
|
|
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
|
|
|
|
|
return sycl::malloc_device(memory_size, *queue);
|
|
|
|
|
# else
|
|
|
|
@ -646,9 +725,26 @@ void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
|
|
|
|
|
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
|
|
|
|
|
{
|
|
|
|
|
assert(queue_);
|
|
|
|
|
/* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
|
|
|
|
|
* runtime errors. It's better to avoid running memory operations in that case.
|
|
|
|
|
* The render will be canceled and the queue will be destroyed anyway. */
|
|
|
|
|
if (have_error())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
|
|
|
|
OneapiDevice::check_usm(queue_, dest, true);
|
|
|
|
|
OneapiDevice::check_usm(queue_, src, true);
|
|
|
|
|
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
|
|
|
|
|
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
|
|
|
|
|
/* Unknown here means, that this is not an USM allocation, which implies that this is
|
|
|
|
|
* some generic C++ allocation, so we could use C++ memcpy directly with USM host. */
|
|
|
|
|
if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
|
|
|
|
|
(src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
|
|
|
|
|
{
|
|
|
|
|
memcpy(dest, src, num_bytes);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
|
|
|
|
|
# ifdef WITH_CYCLES_DEBUG
|
|
|
|
@ -658,8 +754,6 @@ bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t n
|
|
|
|
|
mem_event.wait_and_throw();
|
|
|
|
|
return true;
|
|
|
|
|
# else
|
|
|
|
|
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
|
|
|
|
|
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
|
|
|
|
|
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
|
|
|
|
|
src_type == sycl::usm::alloc::device;
|
|
|
|
|
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
|
|
|
|
@ -684,6 +778,12 @@ bool OneapiDevice::usm_memset(SyclQueue *queue_,
|
|
|
|
|
size_t num_bytes)
|
|
|
|
|
{
|
|
|
|
|
assert(queue_);
|
|
|
|
|
/* sycl::queue::memset may crash if the queue is in an invalid state due to previous
|
|
|
|
|
* runtime errors. It's better to avoid running memory operations in that case.
|
|
|
|
|
* The render will be canceled and the queue will be destroyed anyway. */
|
|
|
|
|
if (have_error())
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
|
|
|
|
|
OneapiDevice::check_usm(queue_, usm_ptr, true);
|
|
|
|
|
try {
|
|
|
|
@ -735,7 +835,7 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
|
|
|
|
|
assert(memory_name);
|
|
|
|
|
assert(memory_device_pointer);
|
|
|
|
|
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
|
|
|
|
|
OneapiDevice::check_usm(queue_, memory_device_pointer);
|
|
|
|
|
OneapiDevice::check_usm(queue_, memory_device_pointer, true);
|
|
|
|
|
OneapiDevice::check_usm(queue_, kernel_globals, true);
|
|
|
|
|
|
|
|
|
|
std::string matched_name(memory_name);
|
|
|
|
@ -874,11 +974,11 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
|
|
|
|
|
|
|
|
|
|
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
|
|
|
|
|
* since Windows driver 101.3268. */
|
|
|
|
|
static const int lowest_supported_driver_version_win = 1015186;
|
|
|
|
|
static const int lowest_supported_driver_version_win = 1015518;
|
|
|
|
|
# ifdef _WIN32
|
|
|
|
|
/* For Windows driver 101.5186, compute-runtime version is 28044.
|
|
|
|
|
/* For Windows driver 101.5518, compute-runtime version is 28044.
|
|
|
|
|
* This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
|
|
|
|
|
static const int lowest_supported_driver_version_neo = 28044;
|
|
|
|
|
static const int lowest_supported_driver_version_neo = 29283;
|
|
|
|
|
# else
|
|
|
|
|
static const int lowest_supported_driver_version_neo = 27642;
|
|
|
|
|
# endif
|
|
|
|
|
Use
const vector<RTCScene>&