Cycles: oneAPI: Enable host memory migration #122385

Merged
Nikita Sirgienko merged 2 commits from Sirgienko/blender:oneapi_enable_host_migraiton into main 2024-05-28 19:04:30 +02:00
8 changed files with 267 additions and 85 deletions

View File

@ -1683,7 +1683,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif device_type == 'ONEAPI':
import sys
if sys.platform.startswith("win"):
driver_version = "XX.X.101.5186"
driver_version = "XX.X.101.5518"
col.label(text=rpt_("Requires Intel GPU with Xe-HPG architecture"), icon='BLANK1', translate=False)
col.label(text=rpt_("and Windows driver version %s or newer") % driver_version,
icon='BLANK1', translate=False)

View File

@ -178,6 +178,51 @@ void BVHEmbree::build(Progress &progress,
rtcCommitScene(scene);
}
string BVHEmbree::get_last_error_message()
{
  /* Translate the Embree device's most recent error code into a
   * human-readable string suitable for user-facing error reports. */
  const RTCError error_code = rtcGetDeviceError(rtc_device);
  const char *message = "unknown error";
  switch (error_code) {
    case RTC_ERROR_NONE:
      message = "no error";
      break;
    case RTC_ERROR_INVALID_ARGUMENT:
      message = "invalid argument error";
      break;
    case RTC_ERROR_INVALID_OPERATION:
      message = "invalid operation error";
      break;
    case RTC_ERROR_OUT_OF_MEMORY:
      message = "out of memory error";
      break;
    case RTC_ERROR_UNSUPPORTED_CPU:
      message = "unsupported cpu error";
      break;
    case RTC_ERROR_CANCELLED:
      message = "cancelled";
      break;
    case RTC_ERROR_UNKNOWN:
    default:
      /* RTC_ERROR_UNKNOWN and any future enum values fall back to the
       * generic message; the default case should only be reached if the
       * RTC error enum gains new members. */
      break;
  }
  return message;
}
Sirgienko marked this conversation as resolved

Use const vector<RTCScene>&

Use `const vector<RTCScene>&`
#  if WITH_EMBREE_GPU && RTC_VERSION >= 40302
bool BVHEmbree::offload_scenes_to_gpu(const vector<RTCScene> &scenes)
{
  /* BVH data is more performance-critical on the GPU than texture data.
   * To keep performance good even when GPU memory runs low, force each
   * scene's BVH to migrate to the GPU before other textures (which may
   * not fit) get allocated. */
  for (const RTCScene &scene_handle : scenes) {
    const RTCSceneFlags flags = rtcGetSceneFlags(scene_handle) |
                                RTC_SCENE_FLAG_PREFETCH_USM_SHARED_ON_GPU;
    rtcSetSceneFlags(scene_handle, flags);
    rtcCommitScene(scene_handle);
    /* Stop on the first Embree error and let the caller propagate it. */
    if (rtcGetDeviceError(rtc_device) != RTC_ERROR_NONE) {
      return false;
    }
  }
  return true;
}
#  endif
void BVHEmbree::add_object(Object *ob, int i)
{
Geometry *geom = ob->get_geometry();

View File

@ -18,6 +18,7 @@
# include "bvh/bvh.h"
# include "bvh/params.h"
# include "util/string.h"
# include "util/thread.h"
# include "util/types.h"
# include "util/vector.h"
@ -36,6 +37,12 @@ class BVHEmbree : public BVH {
const bool isSyclEmbreeDevice = false);
void refit(Progress &progress);
# if WITH_EMBREE_GPU && RTC_VERSION >= 40302
bool offload_scenes_to_gpu(const vector<RTCScene> &scenes);
# endif
string get_last_error_message();
RTCScene scene;
protected:

View File

@ -257,6 +257,7 @@ class device_memory {
friend class OptiXDevice;
friend class HIPDevice;
friend class MetalDevice;
friend class OneapiDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);

View File

@ -11,6 +11,7 @@
# include "device/oneapi/device_impl.h"
# include "util/debug.h"
# include "util/foreach.h"
# include "util/log.h"
# ifdef WITH_EMBREE_GPU
@ -47,18 +48,20 @@ static void queue_error_cb(const char *message, void *user_ptr)
}
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler),
: GPUDevice(info, stats, profiler),
device_queue_(nullptr),
# ifdef WITH_EMBREE_GPU
embree_device(nullptr),
embree_scene(nullptr),
# endif
texture_info_(this, "texture_info", MEM_GLOBAL),
kg_memory_(nullptr),
kg_memory_device_(nullptr),
kg_memory_size_(0)
{
need_texture_info_ = false;
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(void *));
static_assert(sizeof(arrayMemObject) == sizeof(void *));
use_hardware_raytracing = info.use_hardware_raytracing;
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
@ -110,6 +113,18 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
kg_memory_size_ = globals_segment_size;
max_memory_on_device_ = get_memcapacity();
init_host_memory();
move_texture_to_host = false;
can_map_host = true;
const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
if (headroom_str != nullptr) {
const long long override_headroom = (float)atoll(headroom_str);
device_working_headroom = override_headroom;
device_texture_headroom = override_headroom;
}
VLOG_DEBUG << "oneAPI memory headroom size: "
<< string_human_readable_size(device_working_headroom);
}
OneapiDevice::~OneapiDevice()
@ -119,7 +134,7 @@ OneapiDevice::~OneapiDevice()
rtcReleaseDevice(embree_device);
# endif
texture_info_.free();
texture_info.free();
usm_free(device_queue_, kg_memory_);
usm_free(device_queue_, kg_memory_device_);
@ -166,8 +181,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
else {
bvh_embree->build(progress, &stats, embree_device, true);
}
# if RTC_VERSION >= 40302
thread_scoped_lock lock(scene_data_mutex);
all_embree_scenes.push_back(bvh_embree->scene);
# endif
if (bvh->params.top_level) {
embree_scene = bvh_embree->scene;
# if RTC_VERSION >= 40302
if (bvh_embree->offload_scenes_to_gpu(all_embree_scenes) == false) {
set_error(
string_printf("BVH failed to migrate to the GPU due to Embree library error (%s)",
bvh_embree->get_last_error_message()));
}
all_embree_scenes.clear();
# endif
}
}
else {
@ -176,6 +205,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
# endif
size_t OneapiDevice::get_free_mem() const
{
  /* Prefer the exact free-memory value reported by the device when the
   * Intel extension is available. */
  const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
  if (device.has(sycl::aspect::ext_intel_free_memory)) {
    return device.get_info<sycl::ext::intel::info::device::free_memory>();
  }
  /* Otherwise estimate free memory as capacity minus tracked usage,
   * clamped to zero so the unsigned subtraction cannot wrap. */
  return (device_mem_in_use < max_memory_on_device_) ?
             max_memory_on_device_ - device_mem_in_use :
             0;
}
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
@ -208,63 +253,101 @@ bool OneapiDevice::load_kernels(const uint requested_features)
VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
}
if (is_finished_ok) {
reserve_private_memory(requested_features);
is_finished_ok = !have_error();
}
return is_finished_ok;
}
void OneapiDevice::load_texture_info()
void OneapiDevice::reserve_private_memory(const uint kernel_features)
{
if (need_texture_info_) {
need_texture_info_ = false;
texture_info_.copy_to_device();
size_t free_before = get_free_mem();
/* Use the biggest kernel for estimation. */
const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
(kernel_features & KERNEL_FEATURE_MNEE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
{
unique_ptr<DeviceQueue> queue = gpu_queue_create();
device_ptr d_path_index = 0;
device_ptr d_render_buffer = 0;
int d_work_size = 0;
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
queue->init_execution();
/* Launch of the kernel seems to be sufficient to reserve all
* needed memory regardless of the execution global size.
* So, the smallest possible size is used here. */
queue->enqueue(test_kernel, 1, args);
queue->synchronize();
}
size_t free_after = get_free_mem();
VLOG_INFO << "For kernel execution were reserved "
<< string_human_readable_number(free_before - free_after) << " bytes. ("
<< string_human_readable_size(free_before - free_after) << ")";
}
void OneapiDevice::generic_alloc(device_memory &mem)
void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
{
size_t memory_size = mem.memory_size();
/* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
* we can use USM host memory.
* Because of the expected performance impact, implementation of this has had a low priority
* and is not implemented yet. */
assert(device_queue_);
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
* and shared. For new project it maybe more beneficial to use USM shared memory, because it
* provides automatic migration mechanism in order to allow to use the same pointer on host and
* on device, without need to worry about explicit memory transfer operations. But for
* Blender/Cycles this type of memory is not very suitable in current application architecture,
* because Cycles already uses two different pointer for host activity and device activity, and
* also has to perform all needed memory transfer operations. So, USM device memory
* type has been used for oneAPI device in order to better fit in Cycles architecture. */
void *device_pointer = nullptr;
if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
device_pointer = usm_alloc_device(device_queue_, memory_size);
if (device_pointer == nullptr) {
set_error("oneAPI kernel - device memory allocation error for " +
string_human_readable_size(mem.memory_size()) +
", possibly caused by lack of available memory space on the device: " +
string_human_readable_size(stats.mem_used) + " of " +
string_human_readable_size(max_memory_on_device_) + " is already allocated");
}
mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
mem.device_size = memory_size;
stats.mem_alloc(memory_size);
free = get_free_mem();
total = max_memory_on_device_;
}
void OneapiDevice::generic_copy_to(device_memory &mem)
bool OneapiDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.device_pointer) {
return;
}
size_t memory_size = mem.memory_size();
bool allocation_success = false;
device_pointer = usm_alloc_device(device_queue_, size);
if (device_pointer != nullptr) {
allocation_success = true;
/* Due to lazy memory initialisation in the GPU runtime we force the memory to
 * appear in device memory via execution of a kernel using this memory. */
if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
"\"");
usm_free(device_queue_, device_pointer);
/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
assert(mem.host_pointer);
assert(device_queue_);
usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
device_pointer = nullptr;
allocation_success = false;
}
}
return allocation_success;
}
/* Release a device allocation previously returned by alloc_device(). */
void OneapiDevice::free_device(void *device_pointer)
{
  usm_free(device_queue_, device_pointer);
}
/* Allocate USM host memory that the device can access directly.
 * Returns true on success; `shared_pointer` receives the allocation.
 * 64-byte alignment presumably matches cache-line/DMA requirements —
 * TODO(review): confirm against the oneAPI backend expectations. */
bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size)
{
  shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
  return shared_pointer != nullptr;
}
/* Release USM host memory previously returned by alloc_host(). */
void OneapiDevice::free_host(void *shared_pointer)
{
  usm_free(device_queue_, shared_pointer);
}
/* Produce the device-visible pointer for a host allocation.
 * With Unified Shared Memory the host pointer is directly usable on the
 * device, so no translation is needed. */
void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
  /* Device and host pointer are in the same address space
   * as we're using Unified Shared Memory. */
  device_pointer = shared_pointer;
}
/* Copy `size` bytes from a host allocation to a device allocation via the
 * device queue's USM memcpy. */
void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
  usm_memcpy(device_queue_, device_pointer, host_pointer, size);
}
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
@ -288,20 +371,6 @@ void *OneapiDevice::kernel_globals_device_pointer()
return kg_memory_device_;
}
/* Free a generic device allocation and update memory statistics.
 * Safe to call on memory that was never allocated (null device pointer). */
void OneapiDevice::generic_free(device_memory &mem)
{
  if (!mem.device_pointer) {
    return;
  }

  /* Update stats before clearing the recorded size. */
  stats.mem_free(mem.device_size);
  mem.device_size = 0;

  assert(device_queue_);
  usm_free(device_queue_, (void *)mem.device_pointer);
  mem.device_pointer = 0;
}
void OneapiDevice::mem_alloc(device_memory &mem)
{
if (mem.type == MEM_TEXTURE) {
@ -344,7 +413,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
}
else {
if (!mem.device_pointer)
mem_alloc(mem);
generic_alloc(mem);
generic_copy_to(mem);
}
@ -515,14 +584,14 @@ void OneapiDevice::tex_alloc(device_texture &mem)
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
const uint slot = mem.slot;
if (slot >= texture_info_.size()) {
texture_info_.resize(slot + 128);
if (slot >= texture_info.size()) {
texture_info.resize(slot + 128);
}
texture_info_[slot] = mem.info;
need_texture_info_ = true;
texture_info[slot] = mem.info;
need_texture_info = true;
texture_info_[slot].data = (uint64_t)mem.device_pointer;
texture_info[slot].data = (uint64_t)mem.device_pointer;
}
void OneapiDevice::tex_free(device_texture &mem)
@ -628,6 +697,16 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
 * and shared. For a new project it could be more beneficial to use USM shared memory, because it
 * provides an automatic migration mechanism that allows using the same pointer on the host and
 * on the device, without the need to worry about explicit memory transfer operations, although
 * usage of USM shared implies some documented limitations on memory usage with regard to parallel
 * access from different threads. But for Blender/Cycles this type of memory is not very suitable
 * in the current application architecture, because Cycles is a multithreaded application and
 * already uses two different pointers for host activity and device activity, and also has to
 * perform all needed memory transfer operations. So, the USM device memory type has been used
 * for the oneAPI device in order to better fit the Cycles architecture. */
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
return sycl::malloc_device(memory_size, *queue);
# else
@ -646,9 +725,26 @@ void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, dest, true);
OneapiDevice::check_usm(queue_, src, true);
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
/* Unknown here means, that this is not an USM allocation, which implies that this is
* some generic C++ allocation, so we could use C++ memcpy directly with USM host. */
if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
(src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
{
memcpy(dest, src, num_bytes);
return true;
}
try {
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
@ -658,8 +754,6 @@ bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t n
mem_event.wait_and_throw();
return true;
# else
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
@ -684,6 +778,12 @@ bool OneapiDevice::usm_memset(SyclQueue *queue_,
size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memset may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
try {
@ -735,7 +835,7 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
OneapiDevice::check_usm(queue_, memory_device_pointer);
OneapiDevice::check_usm(queue_, memory_device_pointer, true);
OneapiDevice::check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
@ -874,11 +974,11 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
static const int lowest_supported_driver_version_win = 1015186;
static const int lowest_supported_driver_version_win = 1015518;
# ifdef _WIN32
/* For Windows driver 101.5186, compute-runtime version is 28044.
/* For Windows driver 101.5518, compute-runtime version is 28044.
* This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
static const int lowest_supported_driver_version_neo = 28044;
static const int lowest_supported_driver_version_neo = 29283;
# else
static const int lowest_supported_driver_version_neo = 27642;
# endif

View File

@ -21,17 +21,19 @@ typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
bool oidn_support,
void *user_ptr);
class OneapiDevice : public Device {
class OneapiDevice : public GPUDevice {
private:
SyclQueue *device_queue_;
# ifdef WITH_EMBREE_GPU
RTCDevice embree_device;
RTCScene embree_scene;
# if RTC_VERSION >= 40302
thread_mutex scene_data_mutex;
vector<RTCScene> all_embree_scenes;
# endif
# endif
using ConstMemMap = map<string, device_vector<uchar> *>;
ConstMemMap const_mem_map_;
device_vector<TextureInfo> texture_info_;
bool need_texture_info_;
void *kg_memory_;
void *kg_memory_device_;
size_t kg_memory_size_ = (size_t)0;
@ -41,6 +43,8 @@ class OneapiDevice : public Device {
unsigned int kernel_features = 0;
int scene_max_shaders_ = 0;
size_t get_free_mem() const;
public:
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
@ -54,13 +58,15 @@ class OneapiDevice : public Device {
bool load_kernels(const uint kernel_features) override;
void load_texture_info();
void reserve_private_memory(const uint kernel_features);
void generic_alloc(device_memory &mem);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
string oneapi_error_message();

View File

@ -133,6 +133,26 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
return is_computation_correct;
}
bool oneapi_zero_memory_on_device(SyclQueue *queue_, void *device_pointer, size_t num_bytes)
{
  /* Zero `num_bytes` of device memory with a kernel so the (lazily
   * allocated) memory is actually materialized on the device.
   * Returns false and reports through the error callback on failure. */
  assert(queue_);
  sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
  char *bytes = static_cast<char *>(device_pointer);
  try {
    queue->submit([&](sycl::handler &cgh) {
      cgh.parallel_for(num_bytes, [=](sycl::id<1> idx) { bytes[idx.get(0)] = (char)0; });
    });
    queue->wait_and_throw();
  }
  catch (sycl::exception const &e) {
    if (s_error_cb) {
      s_error_cb(e.what(), s_error_user_ptr);
    }
    return false;
  }
  return true;
}
bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
const uint kernel_features)
{

View File

@ -44,6 +44,9 @@ extern "C" {
# endif
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_zero_memory_on_device(SyclQueue *queue_,
void *device_pointer,
size_t num_bytes);
CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,