blender-archive/intern/cycles/kernel/device/oneapi/kernel.cpp
Xavier Hallade 125ac1f914 Cycles: increase min-supported driver version for Intel GPUs
Windows driver 101.3430 fixes an important GUI-related crash, and it's
best to prevent users from running into it.
Linux drivers weren't affected but still received relevant GPU binary
compatibility fixes, so it makes sense to keep the min-supported version
aligned across OSes.
2022-09-26 07:41:47 -07:00


/* SPDX-License-Identifier: Apache-2.0
* Copyright 2021-2022 Intel Corporation */
#ifdef WITH_ONEAPI
/* clang-format off */
# include "kernel.h"
# include <iostream>
# include <map>
# include <set>
# include <CL/sycl.hpp>
# include "kernel/device/oneapi/compat.h"
# include "kernel/device/oneapi/globals.h"
# include "kernel/device/oneapi/kernel_templates.h"
# include "kernel/device/gpu/kernel.h"
/* clang-format on */
static OneAPIErrorCallback s_error_cb = nullptr;
static void *s_error_user_ptr = nullptr;
static std::vector<sycl::device> oneapi_available_devices();
void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
{
s_error_cb = cb;
s_error_user_ptr = user_ptr;
}
void oneapi_check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_host = false)
{
# ifdef _DEBUG
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
sycl::info::device_type device_type =
queue->get_device().get_info<sycl::info::device::device_type>();
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
assert(usm_type == sycl::usm::alloc::device ||
((device_type == sycl::info::device_type::host ||
device_type == sycl::info::device_type::is_cpu || allow_host) &&
usm_type == sycl::usm::alloc::host));
# endif
}
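/* Illustrative usage (editorial sketch, not from the original source): in debug
 * builds (_DEBUG), callers can validate a freshly allocated pointer before use:
 *   void *ptr = oneapi_usm_alloc_device(queue_, size);
 *   oneapi_check_usm(queue_, ptr);  // Asserts this is a device allocation.
 * Host allocations only pass on host/CPU devices or when allow_host is true;
 * in release builds the function compiles to a no-op. */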
bool oneapi_create_queue(SyclQueue *&external_queue, int device_index)
{
bool finished_correct = true;
try {
std::vector<sycl::device> devices = oneapi_available_devices();
if (device_index < 0 || device_index >= devices.size()) {
return false;
}
sycl::queue *created_queue = new sycl::queue(devices[device_index],
sycl::property::queue::in_order());
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
}
catch (sycl::exception const &e) {
finished_correct = false;
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
}
return finished_correct;
}
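/* Hypothetical caller sketch (assumed usage pattern): create, use and destroy a
 * queue for device 0.
 *   SyclQueue *queue = nullptr;
 *   if (oneapi_create_queue(queue, 0)) {
 *     // ... enqueue work ...
 *     oneapi_free_queue(queue);
 *   }
 * The queue is created with the in_order property, so operations submitted to it
 * execute in submission order without explicit event dependencies. */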
void oneapi_free_queue(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
delete queue;
}
void *oneapi_usm_aligned_alloc_host(SyclQueue *queue_, size_t memory_size, size_t alignment)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::aligned_alloc_host(alignment, memory_size, *queue);
}
void *oneapi_usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
return sycl::malloc_device(memory_size, *queue);
}
void oneapi_usm_free(SyclQueue *queue_, void *usm_ptr)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
oneapi_check_usm(queue_, usm_ptr, true);
sycl::free(usm_ptr, *queue);
}
bool oneapi_usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
oneapi_check_usm(queue_, dest, true);
oneapi_check_usm(queue_, src, true);
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
try {
/* NOTE(@nsirgien): Waiting on a memory operation may give more precise error
 * messages. Due to the impact on occupancy, it makes sense to enable this only
 * for Cycles debug builds. */
mem_event.wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
# else
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
src_type == sycl::usm::alloc::unknown;
/* NOTE(@sirgienko): A host-side blocking wait on this operation is mandatory;
 * otherwise the host may start using the memory before the transfer has
 * finished. */
if (from_device_to_host || host_or_device_memop_with_offset)
mem_event.wait();
return true;
# endif
}
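/* Editorial note on the release-build path above: get_pointer_type() reports
 * sycl::usm::alloc::unknown for pointers it cannot attribute to a USM allocation
 * (per the naming above, this covers host- or device-side memops using offset
 * pointers). Both that case and device-to-host copies may touch host memory, so
 * they get a blocking wait(); device-to-device copies are left asynchronous,
 * which is safe on an in-order queue. */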
bool oneapi_usm_memset(SyclQueue *queue_, void *usm_ptr, unsigned char value, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
oneapi_check_usm(queue_, usm_ptr, true);
sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
# ifdef WITH_CYCLES_DEBUG
try {
/* NOTE(@nsirgien): Waiting on a memory operation may give more precise error
 * messages. Due to the impact on occupancy, it makes sense to enable this only
 * for Cycles debug builds. */
mem_event.wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
# else
(void)mem_event;
return true;
# endif
}
bool oneapi_queue_synchronize(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
try {
queue->wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
}
/* NOTE(@nsirgien): Executing this simple kernel checks basic functionality and
 * also triggers runtime compilation of all existing oneAPI kernels. */
bool oneapi_run_test_kernel(SyclQueue *queue_)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
size_t N = 8;
sycl::buffer<float, 1> A(N);
sycl::buffer<float, 1> B(N);
{
sycl::host_accessor A_host_acc(A, sycl::write_only);
for (size_t i = (size_t)0; i < N; i++)
A_host_acc[i] = rand() % 32;
}
try {
queue->submit([&](sycl::handler &cgh) {
sycl::accessor A_acc(A, cgh, sycl::read_only);
sycl::accessor B_acc(B, cgh, sycl::write_only, sycl::no_init);
cgh.parallel_for(N, [=](sycl::id<1> idx) { B_acc[idx] = A_acc[idx] + idx.get(0); });
});
queue->wait_and_throw();
sycl::host_accessor A_host_acc(A, sycl::read_only);
sycl::host_accessor B_host_acc(B, sycl::read_only);
for (size_t i = (size_t)0; i < N; i++) {
float result = A_host_acc[i] + B_host_acc[i];
(void)result;
}
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
return true;
}
bool oneapi_kernel_globals_size(SyclQueue *queue_, size_t &kernel_global_size)
{
kernel_global_size = sizeof(KernelGlobalsGPU);
return true;
}
void oneapi_set_global_memory(SyclQueue *queue_,
void *kernel_globals,
const char *memory_name,
void *memory_device_pointer)
{
assert(queue_);
assert(kernel_globals);
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
oneapi_check_usm(queue_, memory_device_pointer);
oneapi_check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
/* This macro changes the corresponding global pointer of KernelGlobals via name matching. */
# define KERNEL_DATA_ARRAY(type, name) \
else if (#name == matched_name) \
{ \
globals->__##name = (type *)memory_device_pointer; \
return; \
}
if (false) {
}
else if ("integrator_state" == matched_name) {
globals->integrator_state = (IntegratorStateGPU *)memory_device_pointer;
return;
}
KERNEL_DATA_ARRAY(KernelData, data)
# include "kernel/data_arrays.h"
else
{
std::cerr << "Can't found global/constant memory with name \"" << matched_name << "\"!"
<< std::endl;
assert(false);
}
# undef KERNEL_DATA_ARRAY
}
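/* Illustrative macro expansion (hypothetical entry, for exposition only): if
 * data_arrays.h contained KERNEL_DATA_ARRAY(float, lookup_table), the chain above
 * would gain the branch
 *   else if ("lookup_table" == matched_name) {
 *     globals->__lookup_table = (float *)memory_device_pointer;
 *     return;
 *   }
 * i.e. device pointers are bound to KernelGlobals fields purely by name matching. */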
/* TODO: Move device information to OneapiDevice initialized on creation and use it. */
/* TODO: Move below function to oneapi/queue.cpp. */
size_t oneapi_kernel_preferred_local_size(SyclQueue *queue_,
const DeviceKernel kernel,
const size_t kernel_global_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
(void)kernel_global_size;
const static size_t preferred_work_group_size_intersect_shading = 32;
const static size_t preferred_work_group_size_technical = 1024;
size_t preferred_work_group_size = 0;
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA:
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND:
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
preferred_work_group_size = preferred_work_group_size_intersect_shading;
break;
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES:
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES:
case DEVICE_KERNEL_INTEGRATOR_RESET:
case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS:
preferred_work_group_size = preferred_work_group_size_technical;
break;
default:
preferred_work_group_size = 512;
}
const size_t limit_work_group_size =
queue->get_device().get_info<sycl::info::device::max_work_group_size>();
return std::min(limit_work_group_size, preferred_work_group_size);
}
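/* Worked example (illustrative): for a shading kernel on a device reporting
 * max_work_group_size = 512, this returns min(512, 32) = 32; for a "technical"
 * kernel such as DEVICE_KERNEL_INTEGRATOR_RESET it returns min(512, 1024) = 512. */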
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
int kernel,
size_t global_size,
void **args)
{
bool success = true;
::DeviceKernel device_kernel = (::DeviceKernel)kernel;
KernelGlobalsGPU *kg = (KernelGlobalsGPU *)kernel_context->kernel_globals;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(kernel_context->queue);
assert(queue);
if (!queue) {
return false;
}
size_t local_size = oneapi_kernel_preferred_local_size(
kernel_context->queue, device_kernel, global_size);
assert(global_size % local_size == 0);
/* Local size for DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY needs to be enforced so we
* overwrite it outside of oneapi_kernel_preferred_local_size. */
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY) {
local_size = GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE;
}
/* Kernels listed below need a specific number of work groups. */
if (device_kernel == DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY ||
device_kernel == DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY) {
int num_states = *((int *)(args[0]));
/* Round up to the next work-group. */
size_t groups_count = (num_states + local_size - 1) / local_size;
/* NOTE(@nsirgien): For now, non-uniform work-groups don't work on most oneAPI
 * devices, so we extend the work size to fit the uniformity requirement. */
global_size = groups_count * local_size;
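/* Worked example (illustrative numbers): with num_states = 100000 and
 * local_size = 1024, groups_count = (100000 + 1023) / 1024 = 98 and
 * global_size = 98 * 1024 = 100352, i.e. the range is padded up to a whole
 * number of uniform work-groups. */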
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
if (queue->get_device().is_host()) {
global_size = 1;
local_size = 1;
}
# endif
}
/* Let the compiler throw an error if there are any kernels missing in this implementation. */
# if defined(_WIN32)
# pragma warning(error : 4062)
# elif defined(__GNUC__)
# pragma GCC diagnostic push
# pragma GCC diagnostic error "-Wswitch"
# endif
try {
queue->submit([&](sycl::handler &cgh) {
switch (device_kernel) {
case DEVICE_KERNEL_INTEGRATOR_RESET: {
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_camera);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_init_from_bake);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_closest);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_intersect_shadow);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_intersect_subsurface);
break;
}
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_intersect_volume_stack);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_background);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_light);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_shadow);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_shade_surface_raytrace);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_surface_mnee);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_shade_volume);
break;
}
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_queued_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_queued_shadow_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_active_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_terminated_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_terminated_shadow_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_sorted_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_compact_paths_array);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_compact_shadow_paths_array);
break;
}
case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_adaptive_sampling_convergence_check);
break;
}
case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_x);
break;
}
case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_adaptive_sampling_filter_y);
break;
}
case DEVICE_KERNEL_SHADER_EVAL_DISPLACE: {
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_displace);
break;
}
case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_shader_eval_background);
break;
}
case DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_shader_eval_curve_shadow_transparency);
break;
}
case DEVICE_KERNEL_PREFIX_SUM: {
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_prefix_sum);
break;
}
/* clang-format off */
# define DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
case DEVICE_KERNEL_FILM_CONVERT_##VARIANT: { \
oneapi_call(kg, cgh, \
global_size, \
local_size, \
args, \
oneapi_kernel_film_convert_##variant); \
break; \
}
# define DEVICE_KERNEL_FILM_CONVERT(variant, VARIANT) \
DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT, variant) \
DEVICE_KERNEL_FILM_CONVERT_PARTIAL(VARIANT##_HALF_RGBA, variant##_half_rgba)
DEVICE_KERNEL_FILM_CONVERT(depth, DEPTH);
DEVICE_KERNEL_FILM_CONVERT(mist, MIST);
DEVICE_KERNEL_FILM_CONVERT(sample_count, SAMPLE_COUNT);
DEVICE_KERNEL_FILM_CONVERT(float, FLOAT);
DEVICE_KERNEL_FILM_CONVERT(light_path, LIGHT_PATH);
DEVICE_KERNEL_FILM_CONVERT(float3, FLOAT3);
DEVICE_KERNEL_FILM_CONVERT(motion, MOTION);
DEVICE_KERNEL_FILM_CONVERT(cryptomatte, CRYPTOMATTE);
DEVICE_KERNEL_FILM_CONVERT(shadow_catcher, SHADOW_CATCHER);
DEVICE_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow,
SHADOW_CATCHER_MATTE_WITH_SHADOW);
DEVICE_KERNEL_FILM_CONVERT(combined, COMBINED);
DEVICE_KERNEL_FILM_CONVERT(float4, FLOAT4);
# undef DEVICE_KERNEL_FILM_CONVERT
# undef DEVICE_KERNEL_FILM_CONVERT_PARTIAL
/* clang-format on */
case DEVICE_KERNEL_FILTER_GUIDING_PREPROCESS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_filter_guiding_preprocess);
break;
}
case DEVICE_KERNEL_FILTER_GUIDING_SET_FAKE_ALBEDO: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_filter_guiding_set_fake_albedo);
break;
}
case DEVICE_KERNEL_FILTER_COLOR_PREPROCESS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_preprocess);
break;
}
case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_filter_color_postprocess);
break;
}
case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_cryptomatte_postprocess);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_STATES: {
oneapi_call(
kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_compact_states);
break;
}
case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_STATES: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_compact_shadow_states);
break;
}
case DEVICE_KERNEL_INTEGRATOR_SHADOW_CATCHER_COUNT_POSSIBLE_SPLITS: {
oneapi_call(kg,
cgh,
global_size,
local_size,
args,
oneapi_kernel_integrator_shadow_catcher_count_possible_splits);
break;
}
/* Unsupported kernels */
case DEVICE_KERNEL_NUM:
case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
kernel_assert(0);
break;
}
});
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
/* Report failure even when no error callback is registered. */
success = false;
}
# if defined(_WIN32)
# pragma warning(default : 4062)
# elif defined(__GNUC__)
# pragma GCC diagnostic pop
# endif
return success;
}
/* The compute-runtime (i.e. NEO) version is what gets returned by sycl/L0 on
 * Windows since Windows driver 101.3268. */
/* The same minimum compute-runtime version is currently required across Windows
 * and Linux. For Windows driver 101.3430, the compute-runtime version is 23904. */
static const int lowest_supported_driver_version_win = 1013430;
static const int lowest_supported_driver_version_neo = 23904;
static int parse_driver_build_version(const sycl::device &device)
{
const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
int driver_build_version = 0;
size_t second_dot_position = driver_version.find('.', driver_version.find('.') + 1);
if (second_dot_position == std::string::npos) {
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
else {
try {
size_t third_dot_position = driver_version.find('.', second_dot_position + 1);
if (third_dot_position != std::string::npos) {
const std::string &third_number_substr = driver_version.substr(
second_dot_position + 1, third_dot_position - second_dot_position - 1);
const std::string &fourth_number_substr = driver_version.substr(third_dot_position + 1);
if (third_number_substr.length() == 3 && fourth_number_substr.length() == 4)
driver_build_version = std::stoi(third_number_substr) * 10000 +
std::stoi(fourth_number_substr);
}
else {
const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
driver_build_version = std::stoi(third_number_substr);
}
}
catch (std::invalid_argument &e) {
std::cerr << "Unable to parse unknown Intel GPU driver version \"" << driver_version
<< "\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
<< " xx.xx.xxx.xxxx (Windows) for device \""
<< device.get_info<sycl::info::device::name>() << "\"." << std::endl;
}
}
return driver_build_version;
}
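/* Worked examples (illustrative version strings, not taken from the source): a
 * Windows driver reported as "31.0.101.3430" has third/fourth components "101"
 * and "3430", giving 101 * 10000 + 3430 = 1013430; a Level-Zero style version
 * such as "1.3.23904" has no fourth component, giving 23904 directly. These are
 * the values compared against lowest_supported_driver_version_win/_neo above. */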
static std::vector<sycl::device> oneapi_available_devices()
{
bool allow_all_devices = false;
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") != nullptr)
allow_all_devices = true;
/* The host device is useful only for debugging at the moment,
 * so we hide it with default build settings. */
# ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
bool allow_host = true;
# else
bool allow_host = false;
# endif
const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
std::vector<sycl::device> available_devices;
for (const sycl::platform &platform : oneapi_platforms) {
/* Ignore OpenCL platforms to avoid using the same devices through both
 * Level-Zero and OpenCL. */
if (platform.get_backend() == sycl::backend::opencl) {
continue;
}
const std::vector<sycl::device> &oneapi_devices =
(allow_all_devices || allow_host) ? platform.get_devices(sycl::info::device_type::all) :
platform.get_devices(sycl::info::device_type::gpu);
for (const sycl::device &device : oneapi_devices) {
if (allow_all_devices) {
/* Still filter out the host device if the build doesn't support it. */
if (allow_host || !device.is_host()) {
available_devices.push_back(device);
}
}
else {
bool filter_out = false;
/* For now we support all Intel(R) Arc(TM) devices and likely any future GPU,
 * assuming they have either more than 96 Execution Units or a number of threads
 * per EU other than 7. Official support can be broadened to older and smaller
 * GPUs once ready. */
if (device.is_gpu() && platform.get_backend() == sycl::backend::ext_oneapi_level_zero) {
/* Defaults that get filtered out, in case these values aren't available
 * through a too-old L0 runtime. */
int number_of_eus = 96;
int threads_per_eu = 7;
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
number_of_eus = device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
threads_per_eu =
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
/* This filters out all Level-Zero supported GPUs from older generation than Arc. */
if (number_of_eus <= 96 && threads_per_eu == 7) {
filter_out = true;
}
/* If not already filtered out, check the driver version. */
if (!filter_out) {
int driver_build_version = parse_driver_build_version(device);
if ((driver_build_version > 100000 &&
driver_build_version < lowest_supported_driver_version_win) ||
driver_build_version < lowest_supported_driver_version_neo) {
filter_out = true;
}
}
}
else if (!allow_host && device.is_host()) {
filter_out = true;
}
else if (!allow_all_devices) {
filter_out = true;
}
if (!filter_out) {
available_devices.push_back(device);
}
}
}
}
return available_devices;
}
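/* Usage note: defining the environment variable CYCLES_ONEAPI_ALL_DEVICES (any
 * value) bypasses the Arc-or-newer and driver-version filters above, e.g.
 *   CYCLES_ONEAPI_ALL_DEVICES=1 blender
 * (illustrative invocation), which is handy when testing on otherwise
 * filtered-out hardware. */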
char *oneapi_device_capabilities()
{
std::stringstream capabilities;
const std::vector<sycl::device> &oneapi_devices = oneapi_available_devices();
for (const sycl::device &device : oneapi_devices) {
const std::string &name = device.get_info<sycl::info::device::name>();
capabilities << std::string("\t") << name << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
capabilities << "\t\tsycl::info::device::" attribute_name "\t\t\t" << attribute_variable \
<< "\n";
# define GET_NUM_ATTR(attribute) \
{ \
size_t attribute = (size_t)device.get_info<sycl::info::device::attribute>(); \
capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" << attribute << "\n"; \
}
GET_NUM_ATTR(vendor_id)
GET_NUM_ATTR(max_compute_units)
GET_NUM_ATTR(max_work_item_dimensions)
sycl::id<3> max_work_item_sizes =
device.get_info<sycl::info::device::max_work_item_sizes<3>>();
WRITE_ATTR("max_work_item_sizes_dim0", ((size_t)max_work_item_sizes.get(0)))
WRITE_ATTR("max_work_item_sizes_dim1", ((size_t)max_work_item_sizes.get(1)))
WRITE_ATTR("max_work_item_sizes_dim2", ((size_t)max_work_item_sizes.get(2)))
GET_NUM_ATTR(max_work_group_size)
GET_NUM_ATTR(max_num_sub_groups)
GET_NUM_ATTR(sub_group_independent_forward_progress)
GET_NUM_ATTR(preferred_vector_width_char)
GET_NUM_ATTR(preferred_vector_width_short)
GET_NUM_ATTR(preferred_vector_width_int)
GET_NUM_ATTR(preferred_vector_width_long)
GET_NUM_ATTR(preferred_vector_width_float)
GET_NUM_ATTR(preferred_vector_width_double)
GET_NUM_ATTR(preferred_vector_width_half)
GET_NUM_ATTR(native_vector_width_char)
GET_NUM_ATTR(native_vector_width_short)
GET_NUM_ATTR(native_vector_width_int)
GET_NUM_ATTR(native_vector_width_long)
GET_NUM_ATTR(native_vector_width_float)
GET_NUM_ATTR(native_vector_width_double)
GET_NUM_ATTR(native_vector_width_half)
size_t max_clock_frequency =
(size_t)(device.is_host() ? (size_t)0 :
device.get_info<sycl::info::device::max_clock_frequency>());
WRITE_ATTR("max_clock_frequency", max_clock_frequency)
GET_NUM_ATTR(address_bits)
GET_NUM_ATTR(max_mem_alloc_size)
/* NOTE(@nsirgien): The implementation doesn't use image support, as bindless
 * images aren't supported, so we always return false even if the device supports
 * HW-accelerated texture usage. */
bool image_support = false;
WRITE_ATTR("image_support", (size_t)image_support)
GET_NUM_ATTR(max_parameter_size)
GET_NUM_ATTR(mem_base_addr_align)
GET_NUM_ATTR(global_mem_size)
GET_NUM_ATTR(local_mem_size)
GET_NUM_ATTR(error_correction_support)
GET_NUM_ATTR(profiling_timer_resolution)
GET_NUM_ATTR(is_available)
# undef GET_NUM_ATTR
# undef WRITE_ATTR
capabilities << "\n";
}
return ::strdup(capabilities.str().c_str());
}
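/* The returned string is heap-allocated via strdup(); callers are expected to
 * release it with oneapi_free() below. Illustrative output fragment (values made
 * up for exposition):
 *   \tIntel(R) Arc(TM) A770 Graphics
 *   \t\tsycl::info::device::vendor_id\t\t\t32902 */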
void oneapi_free(void *p)
{
if (p) {
::free(p);
}
}
void oneapi_iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_ptr)
{
int num = 0;
std::vector<sycl::device> devices = oneapi_available_devices();
for (sycl::device &device : devices) {
const std::string &platform_name =
device.get_platform().get_info<sycl::info::platform::name>();
std::string name = device.get_info<sycl::info::device::name>();
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::info::device::ext_intel_pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, user_ptr);
num++;
}
}
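/* Illustrative callback id (hypothetical device): a discrete GPU exposing the
 * PCI-address aspect might be reported as
 *   "ONEAPI_Intel(R) Level-Zero_Intel(R) Arc(TM) A770 Graphics_0000:03:00.0"
 * i.e. "ONEAPI_" + platform name + "_" + device name, plus "_" + PCI address. */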
size_t oneapi_get_memcapacity(SyclQueue *queue)
{
return reinterpret_cast<sycl::queue *>(queue)
->get_device()
.get_info<sycl::info::device::global_mem_size>();
}
int oneapi_get_num_multiprocessors(SyclQueue *queue)
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
return device.get_info<sycl::info::device::ext_intel_gpu_eu_count>();
}
else
return 0;
}
int oneapi_get_max_num_threads_per_multiprocessor(SyclQueue *queue)
{
const sycl::device &device = reinterpret_cast<sycl::queue *>(queue)->get_device();
if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
return device.get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>() *
device.get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>();
}
else
return 0;
}
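/* Illustrative numbers (assumption, not from the source): on a GPU that reports
 * ext_intel_gpu_eu_simd_width = 8 and ext_intel_gpu_hw_threads_per_eu = 8, this
 * returns 8 * 8 = 64 threads per multiprocessor. */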
#endif /* WITH_ONEAPI */