Cycles: add RT HW support through Embree for oneAPI backend #106266

Closed
Xavier Hallade wants to merge 10 commits from xavierh/blender:cycles_embree4_gpu into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
48 changed files with 1024 additions and 191 deletions

View File

@ -90,28 +90,26 @@ include(cmake/haru.cmake)
# Boost needs to be included after `python.cmake` due to the PYTHON_BINARY variable being needed.
include(cmake/boost.cmake)
include(cmake/pugixml.cmake)
include(cmake/ispc.cmake)
include(cmake/openimagedenoise.cmake)
include(cmake/embree.cmake)
include(cmake/openpgl.cmake)
include(cmake/fmt.cmake)
include(cmake/robinmap.cmake)
include(cmake/xml2.cmake)
include(cmake/fribidi.cmake)
include(cmake/harfbuzz.cmake)
if(NOT APPLE)
include(cmake/xr_openxr.cmake)
if(NOT WIN32 OR BUILD_MODE STREQUAL Release)
include(cmake/dpcpp.cmake)
include(cmake/dpcpp_deps.cmake)
endif()
include(cmake/dpcpp.cmake)
include(cmake/dpcpp_deps.cmake)
if(NOT WIN32)
include(cmake/igc.cmake)
include(cmake/gmmlib.cmake)
include(cmake/ocloc.cmake)
endif()
endif()
include(cmake/ispc.cmake)
include(cmake/openimagedenoise.cmake)
# Embree needs to be included after dpcpp as it uses it for compiling with GPU support
include(cmake/embree.cmake)
include(cmake/openpgl.cmake)
include(cmake/fmt.cmake)
include(cmake/robinmap.cmake)
include(cmake/xml2.cmake)
# OpenColorIO and dependencies.
include(cmake/expat.cmake)

View File

@ -156,6 +156,7 @@ download_source(OPENCLHEADERS)
download_source(ICDLOADER)
download_source(MP11)
download_source(SPIRV_HEADERS)
download_source(UNIFIED_RUNTIME)
download_source(IGC)
download_source(IGC_LLVM)
download_source(IGC_OPENCL_CLANG)

View File

@ -5,6 +5,9 @@
# for now.
string(REPLACE "-DCMAKE_CXX_STANDARD=17" " " DPCPP_CMAKE_FLAGS "${DEFAULT_CMAKE_FLAGS}")
# DPCPP already generates debug libs, there isn't much point in compiling it in debug mode itself.
string(REPLACE "-DCMAKE_BUILD_TYPE=Debug" "-DCMAKE_BUILD_TYPE=Release" DPCPP_CMAKE_FLAGS "${DPCPP_CMAKE_FLAGS}")
if(WIN32)
set(LLVM_GENERATOR "Ninja")
else()
@ -38,17 +41,18 @@ set(DPCPP_EXTRA_ARGS
-DLEVEL_ZERO_LIBRARY=${LIBDIR}/level-zero/lib/${LIBPREFIX}ze_loader${SHAREDLIBEXT}
-DLEVEL_ZERO_INCLUDE_DIR=${LIBDIR}/level-zero/include
-DLLVM_EXTERNAL_SPIRV_HEADERS_SOURCE_DIR=${BUILD_DIR}/spirvheaders/src/external_spirvheaders/
-DUNIFIED_RUNTIME_SOURCE_DIR=${BUILD_DIR}/unifiedruntime/src/external_unifiedruntime/
# Below here is copied from an invocation of buildbot/config.py
-DLLVM_ENABLE_ASSERTIONS=ON
-DLLVM_TARGETS_TO_BUILD=X86
-DLLVM_EXTERNAL_PROJECTS=sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw
-DLLVM_EXTERNAL_PROJECTS=sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw^^lld
-DLLVM_EXTERNAL_SYCL_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/sycl
-DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/llvm-spirv
-DLLVM_EXTERNAL_XPTI_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xpti
-DXPTI_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xpti
-DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/xptifw
-DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR=${DPCPP_SOURCE_ROOT}/libdevice
-DLLVM_ENABLE_PROJECTS=clang^^sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw
-DLLVM_ENABLE_PROJECTS=clang^^sycl^^llvm-spirv^^opencl^^libdevice^^xpti^^xptifw^^lld
-DLIBCLC_TARGETS_TO_BUILD=
-DLIBCLC_GENERATE_REMANGLED_VARIANTS=OFF
-DSYCL_BUILD_PI_HIP_PLATFORM=AMD
@ -104,13 +108,13 @@ add_dependencies(
external_mp11
external_level-zero
external_spirvheaders
external_unifiedruntime
)
if(BUILD_MODE STREQUAL Release AND WIN32)
ExternalProject_Add_Step(external_dpcpp after_install
COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang-cl.exe
COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang-cpp.exe
COMMAND ${CMAKE_COMMAND} -E rm -f ${LIBDIR}/dpcpp/bin/clang.exe
COMMAND ${CMAKE_COMMAND} -E copy_directory ${LIBDIR}/dpcpp ${HARVEST_TARGET}/dpcpp
)
endif()

View File

@ -59,3 +59,13 @@ ExternalProject_Add(external_spirvheaders
BUILD_COMMAND echo .
INSTALL_COMMAND echo .
)
ExternalProject_Add(external_unifiedruntime
URL file://${PACKAGE_DIR}/${UNIFIED_RUNTIME_FILE}
URL_HASH ${UNIFIED_RUNTIME_HASH_TYPE}=${UNIFIED_RUNTIME_HASH}
DOWNLOAD_DIR ${DOWNLOAD_DIR}
PREFIX ${BUILD_DIR}/unifiedruntime
CONFIGURE_COMMAND echo .
BUILD_COMMAND echo .
INSTALL_COMMAND echo .
)

View File

@ -3,6 +3,8 @@
# Note the utility apps may use png/tiff/gif system libraries, but the
# library itself does not depend on them, so should give no problems.
set(EMBREE_CMAKE_FLAGS ${DEFAULT_CMAKE_FLAGS})
set(EMBREE_EXTRA_ARGS
-DEMBREE_ISPC_SUPPORT=OFF
-DEMBREE_TUTORIALS=OFF
@ -31,6 +33,43 @@ if(NOT BLENDER_PLATFORM_ARM)
)
endif()
if(NOT APPLE)
if(WIN32)
# Levels below -O2 don't work well for Embree+SYCL.
string(REGEX REPLACE "-O[A-Za-z0-9]" "" EMBREE_CLANG_CMAKE_CXX_FLAGS_DEBUG ${BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG})
string(APPEND EMBREE_CLANG_CMAKE_CXX_FLAGS_DEBUG " -O2")
string(REGEX REPLACE "-O[A-Za-z0-9]" "" EMBREE_CLANG_CMAKE_C_FLAGS_DEBUG ${BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG})
string(APPEND EMBREE_CLANG_CMAKE_C_FLAGS_DEBUG " -O2")
set(EMBREE_CMAKE_FLAGS
-DCMAKE_BUILD_TYPE=${BUILD_MODE}
-DCMAKE_CXX_FLAGS_RELEASE=${BLENDER_CLANG_CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_MINSIZEREL=${BLENDER_CLANG_CMAKE_CXX_FLAGS_MINSIZEREL}
-DCMAKE_CXX_FLAGS_RELWITHDEBINFO=${BLENDER_CLANG_CMAKE_CXX_FLAGS_RELWITHDEBINFO}
-DCMAKE_CXX_FLAGS_DEBUG=${EMBREE_CLANG_CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${BLENDER_CLANG_CMAKE_C_FLAGS_RELEASE}
-DCMAKE_C_FLAGS_MINSIZEREL=${BLENDER_CLANG_CMAKE_C_FLAGS_MINSIZEREL}
-DCMAKE_C_FLAGS_RELWITHDEBINFO=${BLENDER_CLANG_CMAKE_C_FLAGS_RELWITHDEBINFO}
-DCMAKE_C_FLAGS_DEBUG=${EMBREE_CLANG_CMAKE_C_FLAGS_DEBUG}
-DCMAKE_CXX_STANDARD=17
)
set(EMBREE_EXTRA_ARGS
-DCMAKE_CXX_COMPILER=${LIBDIR}/dpcpp/bin/clang++.exe
-DCMAKE_C_COMPILER=${LIBDIR}/dpcpp/bin/clang.exe
-DCMAKE_SHARED_LINKER_FLAGS=-L"${LIBDIR}/dpcpp/lib"
-DEMBREE_SYCL_SUPPORT=ON
${EMBREE_EXTRA_ARGS}
)
else()
set(EMBREE_EXTRA_ARGS
-DCMAKE_CXX_COMPILER=${LIBDIR}/dpcpp/bin/clang++
-DCMAKE_C_COMPILER=${LIBDIR}/dpcpp/bin/clang
-DCMAKE_SHARED_LINKER_FLAGS=-L"${LIBDIR}/dpcpp/lib"
-DEMBREE_SYCL_SUPPORT=ON
${EMBREE_EXTRA_ARGS}
)
endif()
endif()
if(TBB_STATIC_LIBRARY)
set(EMBREE_EXTRA_ARGS
${EMBREE_EXTRA_ARGS}
@ -42,16 +81,25 @@ ExternalProject_Add(external_embree
URL file://${PACKAGE_DIR}/${EMBREE_FILE}
DOWNLOAD_DIR ${DOWNLOAD_DIR}
URL_HASH ${EMBREE_HASH_TYPE}=${EMBREE_HASH}
CMAKE_GENERATOR ${PLATFORM_ALT_GENERATOR}
PREFIX ${BUILD_DIR}/embree
PATCH_COMMAND ${PATCH_CMD} -p 1 -d ${BUILD_DIR}/embree/src/external_embree < ${PATCH_DIR}/embree.diff
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/embree ${DEFAULT_CMAKE_FLAGS} ${EMBREE_EXTRA_ARGS}
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBDIR}/embree ${EMBREE_CMAKE_FLAGS} ${EMBREE_EXTRA_ARGS}
INSTALL_DIR ${LIBDIR}/embree
)
add_dependencies(
external_embree
external_tbb
)
if(NOT APPLE)
add_dependencies(
external_embree
external_tbb
external_dpcpp
)
else()
add_dependencies(
external_embree
external_tbb
)
endif()
if(WIN32)
if(BUILD_MODE STREQUAL Release)
@ -66,6 +114,7 @@ if(WIN32)
ExternalProject_Add_Step(external_embree after_install
COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/embree/bin/embree4_d.dll ${HARVEST_TARGET}/embree/bin/embree4_d.dll
COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/embree/lib/embree4_d.lib ${HARVEST_TARGET}/embree/lib/embree4_d.lib
COMMAND ${CMAKE_COMMAND} -E copy ${LIBDIR}/embree/lib/embree4_sycl_d.lib ${HARVEST_TARGET}/embree/lib/embree4_sycl_d.lib
DEPENDEES install
)
endif()

View File

@ -74,6 +74,27 @@ if(WIN32)
set(BLENDER_CMAKE_CXX_FLAGS_RELEASE "/MD ${COMMON_MSVC_FLAGS} /D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /O2 /Ob2 /D NDEBUG /D PLATFORM_WINDOWS /DPSAPI_VERSION=2 /DTINYFORMAT_ALLOW_WCHAR_STRINGS")
set(BLENDER_CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD ${COMMON_MSVC_FLAGS} /D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS /Zi /O2 /Ob1 /D NDEBUG /D PLATFORM_WINDOWS /DPSAPI_VERSION=2 /DTINYFORMAT_ALLOW_WCHAR_STRINGS")
# Set similar flags for CLANG compilation.
set(COMMON_CLANG_FLAGS "-D_DLL -D_MT") # Equivalent to MSVC /MD
if(WITH_OPTIMIZED_DEBUG)
set(BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrtd -O2 -D_DEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
else()
set(BLENDER_CLANG_CMAKE_C_FLAGS_DEBUG "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrtd -g -D_DEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
endif()
set(BLENDER_CLANG_CMAKE_C_FLAGS_MINSIZEREL "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -Os -DNDEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
set(BLENDER_CLANG_CMAKE_C_FLAGS_RELEASE "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -O2 -DNDEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
set(BLENDER_CLANG_CMAKE_C_FLAGS_RELWITHDEBINFO "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -g -O2 -DNDEBUG -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
if(WITH_OPTIMIZED_DEBUG)
set(BLENDER_CLANG_CMAKE_CXX_FLAGS_DEBUG "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrtd -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -O2 -D_DEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS -DBOOST_DEBUG_PYTHON -DBOOST_ALL_NO_LIB")
else()
set(BLENDER_CLANG_CMAKE_CXX_FLAGS_DEBUG "${COMMON_CLANG_FLAG} -Xclang --dependent-lib=msvcrtd -D_DEBUG -DPLATFORM_WINDOWS -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -g -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS -DBOOST_DEBUG_PYTHON -DBOOST_ALL_NO_LIB")
endif()
set(BLENDER_CLANG_CMAKE_CXX_FLAGS_MINSIZEREL "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -O2 -DNDEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
set(BLENDER_CLANG_CMAKE_CXX_FLAGS_RELEASE "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -O2 -DNDEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
set(BLENDER_CLANG_CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_CLANG_FLAGS} -Xclang --dependent-lib=msvcrt -D_SILENCE_ALL_CXX17_DEPRECATION_WARNINGS -g -O2 -DNDEBUG -DPLATFORM_WINDOWS -DPSAPI_VERSION=2 -DTINYFORMAT_ALLOW_WCHAR_STRINGS")
set(PLATFORM_FLAGS)
set(PLATFORM_CXX_FLAGS)
set(PLATFORM_CMAKE_FLAGS)

View File

@ -599,15 +599,15 @@ set(OPENPGL_HASH db63f5dac5cfa8c110ede241f0c413f00db0c4748697381c4fa23e0f9e82a75
set(OPENPGL_HASH_TYPE SHA256)
set(OPENPGL_FILE openpgl-${OPENPGL_VERSION}.tar.gz)
set(LEVEL_ZERO_VERSION v1.8.5)
set(LEVEL_ZERO_VERSION v1.8.8)
set(LEVEL_ZERO_URI https://github.com/oneapi-src/level-zero/archive/refs/tags/${LEVEL_ZERO_VERSION}.tar.gz)
set(LEVEL_ZERO_HASH b6e9663bbcc53c148d32376998298bec6f7c434ef2218c61fa708963e3a09394)
set(LEVEL_ZERO_HASH 3553ae8fa0d2d69c4210a8f3428bd6612bd8bb8a627faf52c3658a01851e66d2)
set(LEVEL_ZERO_HASH_TYPE SHA256)
set(LEVEL_ZERO_FILE level-zero-${LEVEL_ZERO_VERSION}.tar.gz)
set(DPCPP_VERSION 20221019)
set(DPCPP_URI https://github.com/intel/llvm/archive/refs/tags/sycl-nightly/${DPCPP_VERSION}.tar.gz)
set(DPCPP_HASH 2f533946e91ce3829431758ea17b0b834b960c1a796e9e4563c86e03eb9603a2)
set(DPCPP_VERSION 2022-12)
set(DPCPP_URI https://github.com/intel/llvm/archive/refs/tags/${DPCPP_VERSION}.tar.gz)
set(DPCPP_HASH 13151d5ae79f7c9c4a9b072a0c486ae7b3c4993e301bb1268c92214451025790)
set(DPCPP_HASH_TYPE SHA256)
set(DPCPP_FILE DPCPP-${DPCPP_VERSION}.tar.gz)
@ -620,9 +620,9 @@ set(DPCPP_FILE DPCPP-${DPCPP_VERSION}.tar.gz)
# will take care of building them, unpack is being done in dpcpp_deps.cmake
# Source llvm/lib/SYCLLowerIR/CMakeLists.txt
set(VCINTRINSICS_VERSION abce9184b7a3a7fe1b02289b9285610d9dc45465)
set(VCINTRINSICS_VERSION 782fbf7301dc73acaa049a4324c976ad94f587f7)
set(VCINTRINSICS_URI https://github.com/intel/vc-intrinsics/archive/${VCINTRINSICS_VERSION}.tar.gz)
set(VCINTRINSICS_HASH 3e9fd471246b87633b26f7e15e17ab7733d357458c53d5c5881c03929d6c551f)
set(VCINTRINSICS_HASH f4c0ccad8c1f77760364c551c65e8e1cf194d058889fa46d3b1b2d19ec4dc33f)
set(VCINTRINSICS_HASH_TYPE SHA256)
set(VCINTRINSICS_FILE vc-intrinsics-${VCINTRINSICS_VERSION}.tar.gz)
@ -657,6 +657,13 @@ set(SPIRV_HEADERS_HASH ec8ecb471a62672697846c436501638ab25447ae9d4a6761e0bfe8a9a
set(SPIRV_HEADERS_HASH_TYPE SHA256)
set(SPIRV_HEADERS_FILE SPIR-V-Headers-${SPIRV_HEADERS_VERSION}.tar.gz)
# Source llvm/sycl/plugins/unified_runtime/CMakeLists.txt
set(UNIFIED_RUNTIME_VERSION fd711c920acc4434cb52ff18b078c082d9d7f44d)
set(UNIFIED_RUNTIME_URI https://github.com/oneapi-src/unified-runtime/archive/${UNIFIED_RUNTIME_VERSION}.tar.gz)
set(UNIFIED_RUNTIME_HASH 535ca2ee78f68c5e7e62b10f1bbabd909179488885566e6d9b1fc50e8a1be65f)
set(UNIFIED_RUNTIME_HASH_TYPE SHA256)
set(UNIFIED_RUNTIME_FILE unified-runtime-${UNIFIED_RUNTIME_VERSION}.tar.gz)
######################
### DPCPP DEPS END ###
######################
@ -730,9 +737,9 @@ set(GMMLIB_HASH c1f33e1519edfc527127baeb0436b783430dfd256c643130169a3a71dc86aff9
set(GMMLIB_HASH_TYPE SHA256)
set(GMMLIB_FILE ${GMMLIB_VERSION}.tar.gz)
set(OCLOC_VERSION 22.49.25018.21)
set(OCLOC_VERSION 23.05.25593.18)
set(OCLOC_URI https://github.com/intel/compute-runtime/archive/refs/tags/${OCLOC_VERSION}.tar.gz)
set(OCLOC_HASH 92362dae08b503a34e5d3820ed284198c452bcd5e7504d90eb69887b20492c06)
set(OCLOC_HASH 122415028e631922ae999c996954dfd98ce9a32decd564d5484c31476ec9306e)
set(OCLOC_HASH_TYPE SHA256)
set(OCLOC_FILE ocloc-${OCLOC_VERSION}.tar.gz)

View File

@ -14,6 +14,7 @@ graph[autosize = false, size = "25.7,8.3!", resolution = 300];
external_dpcpp -- external_mp11;
external_dpcpp -- external_level_zero;
external_dpcpp -- external_spirvheaders;
external_dpcpp -- external_unifiedruntime;
external_embree -- external_tbb;
external_ffmpeg -- external_zlib;
external_ffmpeg -- external_openjpeg;

View File

@ -34,3 +34,156 @@ diff -Naur llvm-sycl-nightly-20220208.orig/libdevice/cmake/modules/SYCLLibdevice
libsycldevice-obj
libsycldevice-spv)
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index 17eeaafae194..09e6d2217aaa 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1647,46 +1647,120 @@ ProgramManager::getSYCLDeviceImagesWithCompatibleState(
}
assert(BinImages.size() > 0 && "Expected to find at least one device image");
+ // Ignore images with incompatible state. Image is considered compatible
+ // with a target state if an image is already in the target state or can
+ // be brought to target state by compiling/linking/building.
+ //
+ // Example: an image in "executable" state is not compatible with
+ // "input" target state - there is no operation to convert the image it
+ // to "input" state. An image in "input" state is compatible with
+ // "executable" target state because it can be built to get into
+ // "executable" state.
+ for (auto It = BinImages.begin(); It != BinImages.end();) {
+ if (getBinImageState(*It) > TargetState)
+ It = BinImages.erase(It);
+ else
+ ++It;
+ }
+
std::vector<device_image_plain> SYCLDeviceImages;
- for (RTDeviceBinaryImage *BinImage : BinImages) {
- const bundle_state ImgState = getBinImageState(BinImage);
-
- // Ignore images with incompatible state. Image is considered compatible
- // with a target state if an image is already in the target state or can
- // be brought to target state by compiling/linking/building.
- //
- // Example: an image in "executable" state is not compatible with
- // "input" target state - there is no operation to convert the image it
- // to "input" state. An image in "input" state is compatible with
- // "executable" target state because it can be built to get into
- // "executable" state.
- if (ImgState > TargetState)
- continue;
- for (const sycl::device &Dev : Devs) {
+ // If a non-input state is requested, we can filter out some compatible
+ // images and return only those with the highest compatible state for each
+ // device-kernel pair. This map tracks how many kernel-device pairs need each
+ // image, so that any unneeded ones are skipped.
+ // TODO this has no effect if the requested state is input, consider having
+ // a separate branch for that case to avoid unnecessary tracking work.
+ struct DeviceBinaryImageInfo {
+ std::shared_ptr<std::vector<sycl::kernel_id>> KernelIDs;
+ bundle_state State = bundle_state::input;
+ int RequirementCounter = 0;
+ };
+ std::unordered_map<RTDeviceBinaryImage *, DeviceBinaryImageInfo> ImageInfoMap;
+
+ for (const sycl::device &Dev : Devs) {
+ // Track the highest image state for each requested kernel.
+ using StateImagesPairT =
+ std::pair<bundle_state, std::vector<RTDeviceBinaryImage *>>;
+ using KernelImageMapT =
+ std::map<kernel_id, StateImagesPairT, LessByNameComp>;
+ KernelImageMapT KernelImageMap;
+ if (!KernelIDs.empty())
+ for (const kernel_id &KernelID : KernelIDs)
+ KernelImageMap.insert({KernelID, {}});
+
+ for (RTDeviceBinaryImage *BinImage : BinImages) {
if (!compatibleWithDevice(BinImage, Dev) ||
!doesDevSupportImgAspects(Dev, *BinImage))
continue;
- std::shared_ptr<std::vector<sycl::kernel_id>> KernelIDs;
- // Collect kernel names for the image
- {
- std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
- KernelIDs = m_BinImg2KernelIDs[BinImage];
- // If the image does not contain any non-service kernels we can skip it.
- if (!KernelIDs || KernelIDs->empty())
- continue;
+ auto InsertRes = ImageInfoMap.insert({BinImage, {}});
+ DeviceBinaryImageInfo &ImgInfo = InsertRes.first->second;
+ if (InsertRes.second) {
+ ImgInfo.State = getBinImageState(BinImage);
+ // Collect kernel names for the image
+ {
+ std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
+ ImgInfo.KernelIDs = m_BinImg2KernelIDs[BinImage];
+ }
}
+ const bundle_state ImgState = ImgInfo.State;
+ const std::shared_ptr<std::vector<sycl::kernel_id>> &ImageKernelIDs =
+ ImgInfo.KernelIDs;
+ int &ImgRequirementCounter = ImgInfo.RequirementCounter;
- DeviceImageImplPtr Impl = std::make_shared<detail::device_image_impl>(
- BinImage, Ctx, Devs, ImgState, KernelIDs, /*PIProgram=*/nullptr);
+ // If the image does not contain any non-service kernels we can skip it.
+ if (!ImageKernelIDs || ImageKernelIDs->empty())
+ continue;
- SYCLDeviceImages.push_back(
- createSyclObjFromImpl<device_image_plain>(Impl));
- break;
+ // Update tracked information.
+ for (kernel_id &KernelID : *ImageKernelIDs) {
+ StateImagesPairT *StateImagesPair;
+ // If only specific kernels are requested, ignore the rest.
+ if (!KernelIDs.empty()) {
+ auto It = KernelImageMap.find(KernelID);
+ if (It == KernelImageMap.end())
+ continue;
+ StateImagesPair = &It->second;
+ } else
+ StateImagesPair = &KernelImageMap[KernelID];
+
+ auto &[KernelImagesState, KernelImages] = *StateImagesPair;
+
+ if (KernelImages.empty()) {
+ KernelImagesState = ImgState;
+ KernelImages.push_back(BinImage);
+ ++ImgRequirementCounter;
+ } else if (KernelImagesState < ImgState) {
+ for (RTDeviceBinaryImage *Img : KernelImages) {
+ auto It = ImageInfoMap.find(Img);
+ assert(It != ImageInfoMap.end());
+ assert(It->second.RequirementCounter > 0);
+ --(It->second.RequirementCounter);
+ }
+ KernelImages.clear();
+ KernelImages.push_back(BinImage);
+ KernelImagesState = ImgState;
+ ++ImgRequirementCounter;
+ } else if (KernelImagesState == ImgState) {
+ KernelImages.push_back(BinImage);
+ ++ImgRequirementCounter;
+ }
+ }
}
}
+ for (const auto &ImgInfoPair : ImageInfoMap) {
+ if (ImgInfoPair.second.RequirementCounter == 0)
+ continue;
+
+ DeviceImageImplPtr Impl = std::make_shared<detail::device_image_impl>(
+ ImgInfoPair.first, Ctx, Devs, ImgInfoPair.second.State,
+ ImgInfoPair.second.KernelIDs, /*PIProgram=*/nullptr);
+
+ SYCLDeviceImages.push_back(createSyclObjFromImpl<device_image_plain>(Impl));
+ }
+
return SYCLDeviceImages;
}

View File

@ -149,3 +149,19 @@ index 074f910a2..30f490818 100644
return is_hit_first | is_hit_second;
}
};
diff -ruN a/kernels/sycl/rthwif_embree_builder.cpp b/kernels/sycl/rthwif_embree_builder.cpp
--- a/kernels/sycl/rthwif_embree_builder.cpp 2023-03-28 17:23:06.429190200 +0200
+++ b/kernels/sycl/rthwif_embree_builder.cpp 2023-03-28 17:35:01.291938600 +0200
@@ -540,7 +540,12 @@
assert(offset <= geomDescrData.size());
}
+ /* Force running BVH building sequentially from the calling thread if using TBB < 2021, as it otherwise leads to runtime issues. */
+#if TBB_VERSION_MAJOR<2021
+ RTHWIF_PARALLEL_OPERATION parallelOperation = nullptr;
+#else
RTHWIF_PARALLEL_OPERATION parallelOperation = rthwifNewParallelOperation();
+#endif
/* estimate static accel size */
BBox1f time_range(0,1);

View File

@ -281,6 +281,9 @@ endif()
if(WITH_CYCLES_EMBREE)
add_definitions(-DWITH_EMBREE)
if(WITH_CYCLES_DEVICE_ONEAPI AND EMBREE_SYCL_SUPPORT)

if(WITH_CYCLES_DEVICE_ONEAPI AND EMBREE_SYCL_SUPPORT)

`if(WITH_CYCLES_DEVICE_ONEAPI AND EMBREE_SYCL_SUPPORT)`
add_definitions(-DWITH_EMBREE_GPU)
endif()
add_definitions(-DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION})
include_directories(
SYSTEM

View File

@ -1544,6 +1544,13 @@ class CyclesPreferences(bpy.types.AddonPreferences):
default=False,
)
use_oneapirt: BoolProperty(
name="Hardware Ray Tracing",

Users don't need to know what Embree is. Can you juse name this "Hardware Ray Tracing"?

Users don't need to know what Embree is. Can you juse name this "Hardware Ray Tracing"?
description="Embree GPU execution will allow to use hardware ray tracing on Intel GPUs, which will provide better performance. "

HW RT for ray tracing -> hardware ray tracing

However this support is experimental and some scenes may render incorrectly

Is this true? If it's experimental it should not be enabled by default.

HW RT for ray tracing -> hardware ray tracing > However this support is experimental and some scenes may render incorrectly Is this true? If it's experimental it should not be enabled by default.

It's close to not being experimental anymore, we can keep it off and switch it on during bcon2.

It's close to not being experimental anymore, we can keep it off and switch it on during bcon2.
"However this support is experimental and some scenes may render incorrectly",
default=False,
)
kernel_optimization_level: EnumProperty(
name="Kernel Optimization",
description="Kernels can be optimized based on scene content. Optimized kernels are requested at the start of a render. "
@ -1763,6 +1770,11 @@ class CyclesPreferences(bpy.types.AddonPreferences):
col.prop(self, "kernel_optimization_level")
col.prop(self, "use_metalrt")
if compute_device_type == 'ONEAPI' and _cycles.with_embree_gpu:
row = layout.row()
row.use_property_split = True
row.prop(self, "use_oneapirt")
def draw(self, context):
self.draw_impl(self.layout, context)

View File

@ -112,9 +112,26 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences,
device.has_peer_memory = false;
}
if (get_boolean(cpreferences, "use_metalrt")) {
device.use_metalrt = true;
bool accumulated_use_hardware_raytracing = false;
foreach (
DeviceInfo &info,
(device.multi_devices.size() != 0 ? device.multi_devices : vector<DeviceInfo>({device}))) {
if (info.type == DEVICE_METAL && !get_boolean(cpreferences, "use_metalrt")) {
info.use_hardware_raytracing = false;
}
if (info.type == DEVICE_ONEAPI && !get_boolean(cpreferences, "use_oneapirt")) {
info.use_hardware_raytracing = false;
}
/* There is an accumulative logic here, because Multidevices are support only for
* the same backend + CPU in Blender right now, and both oneAPI and Metal have a
* global boolean backend setting (see above) for enabling/disabling HW RT,
* so all subdevices in the multidevice should enable (or disable) HW RT
* simultaneously (and CPU device are expected to ignore "use_hardware_raytracing" setting) */
accumulated_use_hardware_raytracing |= info.use_hardware_raytracing;
}
device.use_hardware_raytracing = accumulated_use_hardware_raytracing;
if (preview) {
/* Disable specialization for preview renders. */

View File

@ -1034,6 +1034,14 @@ void *CCL_python_module_init()
Py_INCREF(Py_False);
#endif /* WITH_EMBREE */
#ifdef WITH_EMBREE_GPU
PyModule_AddObject(mod, "with_embree_gpu", Py_True);
Py_INCREF(Py_True);
#else /* WITH_EMBREE_GPU */
PyModule_AddObject(mod, "with_embree_gpu", Py_False);
Py_INCREF(Py_False);
#endif /* WITH_EMBREE_GPU */
if (ccl::openimagedenoise_supported()) {
PyModule_AddObject(mod, "with_openimagedenoise", Py_True);
Py_INCREF(Py_True);

View File

@ -84,7 +84,7 @@ CPUDevice::~CPUDevice()
texture_info.free();
}
BVHLayoutMask CPUDevice::get_bvh_layout_mask() const
BVHLayoutMask CPUDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
#ifdef WITH_EMBREE

View File

@ -56,7 +56,7 @@ class CPUDevice : public Device {
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
~CPUDevice();
virtual BVHLayoutMask get_bvh_layout_mask() const override;
virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;
/* Returns true if the texture info was copied to the device (meaning, some more
* re-initialization might be needed). */

View File

@ -35,7 +35,7 @@ bool CUDADevice::have_precompiled_kernels()
return path_exists(cubins_path);
}
BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
BVHLayoutMask CUDADevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
return BVH_LAYOUT_BVH2;
}

View File

@ -38,7 +38,7 @@ class CUDADevice : public GPUDevice {
static bool have_precompiled_kernels();
virtual BVHLayoutMask get_bvh_layout_mask() const override;
virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;
void set_error(const string &error) override;

View File

@ -354,7 +354,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_guiding = true;
info.has_profiling = true;
info.has_peer_memory = false;
info.use_metalrt = false;
info.use_hardware_raytracing = false;
info.denoisers = DENOISER_ALL;
foreach (const DeviceInfo &device, subdevices) {
@ -403,7 +403,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo> &subdevices,
info.has_guiding &= device.has_guiding;
info.has_profiling &= device.has_profiling;
info.has_peer_memory |= device.has_peer_memory;
info.use_metalrt |= device.use_metalrt;
info.use_hardware_raytracing |= device.use_hardware_raytracing;
info.denoisers &= device.denoisers;
}

View File

@ -71,15 +71,16 @@ class DeviceInfo {
string description;
string id; /* used for user preferences, should stay fixed with changing hardware config */
int num;
bool display_device; /* GPU is used as a display device. */
bool has_nanovdb; /* Support NanoVDB volumes. */
bool has_light_tree; /* Support light tree. */
bool has_osl; /* Support Open Shading Language. */
bool has_guiding; /* Support path guiding. */
bool has_profiling; /* Supports runtime collection of profiling info. */
bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
bool has_gpu_queue; /* Device supports GPU queue. */
bool use_metalrt; /* Use MetalRT to accelerate ray queries (Metal only). */
bool display_device; /* GPU is used as a display device. */
bool has_nanovdb; /* Support NanoVDB volumes. */
bool has_light_tree; /* Support light tree. */
bool has_osl; /* Support Open Shading Language. */
bool has_guiding; /* Support path guiding. */
bool has_profiling; /* Supports runtime collection of profiling info. */
bool has_peer_memory; /* GPU has P2P access to memory of another GPU. */
bool has_gpu_queue; /* Device supports GPU queue. */
bool use_hardware_raytracing; /* Use hardware ray tracing to accelerate ray queries in a backend.

use_hardware_raytracing to avoid obscure abbreviation.

`use_hardware_raytracing` to avoid obscure abbreviation.
*/
KernelOptimizationLevel kernel_optimization_level; /* Optimization level applied to path tracing
* kernels (Metal only). */
DenoiserTypeMask denoisers; /* Supported denoiser types. */
@ -101,7 +102,7 @@ class DeviceInfo {
has_profiling = false;
has_peer_memory = false;
has_gpu_queue = false;
use_metalrt = false;
use_hardware_raytracing = false;
denoisers = DENOISER_NONE;
}
@ -157,7 +158,7 @@ class Device {
fprintf(stderr, "%s\n", error.c_str());
fflush(stderr);
}
virtual BVHLayoutMask get_bvh_layout_mask() const = 0;
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const = 0;
/* statistics */
Stats &stats;

View File

@ -20,7 +20,7 @@ class DummyDevice : public Device {
~DummyDevice() {}
virtual BVHLayoutMask get_bvh_layout_mask() const override
virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override
{
return 0;
}

View File

@ -35,7 +35,7 @@ bool HIPDevice::have_precompiled_kernels()
return path_exists(fatbins_path);
}
BVHLayoutMask HIPDevice::get_bvh_layout_mask() const
BVHLayoutMask HIPDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
return BVH_LAYOUT_BVH2;
}

View File

@ -35,7 +35,7 @@ class HIPDevice : public GPUDevice {
static bool have_precompiled_kernels();
virtual BVHLayoutMask get_bvh_layout_mask() const override;
virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;
void set_error(const string &error) override;

View File

@ -3,7 +3,9 @@
#include "device/kernel.h"
#include "util/log.h"
#ifndef __KERNEL_ONEAPI__
# include "util/log.h"
#endif
CCL_NAMESPACE_BEGIN
@ -153,10 +155,13 @@ const char *device_kernel_as_string(DeviceKernel kernel)
case DEVICE_KERNEL_NUM:
break;
};
#ifndef __KERNEL_ONEAPI__
LOG(FATAL) << "Unhandled kernel " << static_cast<int>(kernel) << ", should never happen.";
#endif
return "UNKNOWN";
}
#ifndef __KERNEL_ONEAPI__
std::ostream &operator<<(std::ostream &os, DeviceKernel kernel)
{
os << device_kernel_as_string(kernel);
@ -178,5 +183,6 @@ string device_kernel_mask_as_string(DeviceKernelMask mask)
return str;
}
#endif
CCL_NAMESPACE_END

View File

@ -3,11 +3,13 @@
#pragma once
#include "kernel/types.h"
#ifndef __KERNEL_ONEAPI__
# include "kernel/types.h"
#include "util/string.h"
# include "util/string.h"
#include <ostream> // NOLINT
# include <ostream> // NOLINT
#endif
CCL_NAMESPACE_BEGIN
@ -15,9 +17,12 @@ bool device_kernel_has_shading(DeviceKernel kernel);
bool device_kernel_has_intersection(DeviceKernel kernel);
const char *device_kernel_as_string(DeviceKernel kernel);
#ifndef __KERNEL_ONEAPI__
std::ostream &operator<<(std::ostream &os, DeviceKernel kernel);
typedef uint64_t DeviceKernelMask;
string device_kernel_mask_as_string(DeviceKernelMask mask);
#endif
CCL_NAMESPACE_END

View File

@ -100,7 +100,7 @@ class MetalDevice : public Device {
virtual void cancel() override;
virtual BVHLayoutMask get_bvh_layout_mask() const override;
virtual BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;
void set_error(const string &error) override;

View File

@ -39,7 +39,7 @@ bool MetalDevice::is_device_cancelled(int ID)
return get_device_by_ID(ID, lock) == nullptr;
}
BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
BVHLayoutMask MetalDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
}
@ -100,12 +100,12 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
}
case METAL_GPU_AMD: {
max_threads_per_threadgroup = 128;
use_metalrt = info.use_metalrt;
use_metalrt = info.use_hardware_raytracing;
break;
}
case METAL_GPU_APPLE: {
max_threads_per_threadgroup = 512;
use_metalrt = info.use_metalrt;
use_metalrt = info.use_hardware_raytracing;
break;
}
}

View File

@ -96,12 +96,13 @@ class MultiDevice : public Device {
return error_msg;
}
virtual BVHLayoutMask get_bvh_layout_mask() const override
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override
{
BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_ALL;
BVHLayoutMask bvh_layout_mask_all = BVH_LAYOUT_NONE;
foreach (const SubDevice &sub_device, devices) {
BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask();
BVHLayoutMask device_bvh_layout_mask = sub_device.device->get_bvh_layout_mask(
kernel_features);
bvh_layout_mask &= device_bvh_layout_mask;
bvh_layout_mask_all |= device_bvh_layout_mask;
}

View File

@ -40,12 +40,12 @@ bool device_oneapi_init()
if (getenv("SYCL_CACHE_TRESHOLD") == nullptr) {
_putenv_s("SYCL_CACHE_THRESHOLD", "0");
}
if (getenv("SYCL_DEVICE_FILTER") == nullptr) {
if (getenv("ONEAPI_DEVICE_SELECTOR") == nullptr) {
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
_putenv_s("SYCL_DEVICE_FILTER", "level_zero");
_putenv_s("ONEAPI_DEVICE_SELECTOR", "level_zero:*");
}
else {
_putenv_s("SYCL_DEVICE_FILTER", "level_zero,cuda,hip");
_putenv_s("ONEAPI_DEVICE_SELECTOR", "!opencl:*");
}
}
if (getenv("SYCL_ENABLE_PCI") == nullptr) {
@ -58,10 +58,10 @@ bool device_oneapi_init()
setenv("SYCL_CACHE_PERSISTENT", "1", false);
setenv("SYCL_CACHE_THRESHOLD", "0", false);
if (getenv("CYCLES_ONEAPI_ALL_DEVICES") == nullptr) {
setenv("SYCL_DEVICE_FILTER", "level_zero", false);
setenv("ONEAPI_DEVICE_SELECTOR", "level_zero:*", false);
}
else {
setenv("SYCL_DEVICE_FILTER", "level_zero,cuda,hip", false);
setenv("ONEAPI_DEVICE_SELECTOR", "!opencl:*", false);
}
setenv("SYCL_ENABLE_PCI", "1", false);
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
@ -87,7 +87,8 @@ Device *device_oneapi_create(const DeviceInfo &info, Stats &stats, Profiler &pro
}
#ifdef WITH_ONEAPI
static void device_iterator_cb(const char *id, const char *name, int num, void *user_ptr)
static void device_iterator_cb(
const char *id, const char *name, int num, bool hwrt_support, void *user_ptr)
{
vector<DeviceInfo> *devices = (vector<DeviceInfo> *)user_ptr;
@ -112,6 +113,12 @@ static void device_iterator_cb(const char *id, const char *name, int num, void *
/* NOTE(@nsirgien): Seems not possible to know from SYCL/oneAPI or Level0. */
info.display_device = false;
# if WITH_EMBREE_GPU

#ifdef WITH_EMBREE_GPU is enough

`#ifdef WITH_EMBREE_GPU` is enough
info.use_hardware_raytracing = hwrt_support;
# else
info.use_hardware_raytracing = false;
# endif
devices->push_back(info);
VLOG_INFO << "Added device \"" << name << "\" with id \"" << info.id << "\".";
}

View File

@ -8,7 +8,19 @@
# include "util/debug.h"
# include "util/log.h"
# ifdef WITH_EMBREE_GPU
# include "bvh/embree.h"
# endif
# include "kernel/device/oneapi/globals.h"
# include "kernel/device/oneapi/kernel.h"
# if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
/* These declarations are missing from embree headers when compiling from a compiler that doesn't
* support SYCL. */
extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context, const char *config);
extern "C" bool rtcIsSYCLDeviceSupported(const sycl::device sycl_device);
# endif
CCL_NAMESPACE_BEGIN
@ -22,16 +34,29 @@ static void queue_error_cb(const char *message, void *user_ptr)
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler),
device_queue_(nullptr),
# ifdef WITH_EMBREE_GPU
embree_device(nullptr),
embree_scene(nullptr),
# endif
texture_info_(this, "texture_info", MEM_GLOBAL),
kg_memory_(nullptr),
kg_memory_device_(nullptr),
kg_memory_size_(0)
{
need_texture_info_ = false;

Hardware Raytracing -> hardware ray tracing

Hardware Raytracing -> hardware ray tracing
use_hardware_raytracing = info.use_hardware_raytracing;
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
bool is_finished_ok = create_queue(device_queue_, info.num);
bool is_finished_ok = create_queue(device_queue_,
info.num,
# ifdef WITH_EMBREE_GPU
use_hardware_raytracing ? &embree_device : nullptr
# else
nullptr
# endif
);
xavierh marked this conversation as resolved

Can you move this before oneAPI will try to use Hardware Raytracing capabilities for intersection acceleration? So we can definitely say if it is used or not.

Can you move this before `oneAPI will try to use Hardware Raytracing capabilities for intersection acceleration`? So we can definitely say if it is used or not.
Review

done, I've moved the string to after this block since it also checks for embree_device init.

done, I've moved the string to after this block since it also checks for embree_device init.
if (is_finished_ok == false) {
set_error("oneAPI queue initialization error: got runtime exception \"" +
oneapi_error_string_ + "\"");
@ -42,6 +67,16 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
assert(device_queue_);
}
# ifdef WITH_EMBREE_GPU
use_hardware_raytracing = use_hardware_raytracing && (embree_device != nullptr);
# else
use_hardware_raytracing = false;
# endif
if (use_hardware_raytracing) {
VLOG_INFO << "oneAPI will use hardware ray tracing for intersection acceleration.";
}
size_t globals_segment_size;
is_finished_ok = kernel_globals_size(globals_segment_size);
if (is_finished_ok == false) {
@ -64,6 +99,11 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profi
OneapiDevice::~OneapiDevice()
{
# ifdef WITH_EMBREE_GPU
if (embree_device)
rtcReleaseDevice(embree_device);
# endif
texture_info_.free();
usm_free(device_queue_, kg_memory_);
usm_free(device_queue_, kg_memory_device_);
@ -80,15 +120,47 @@ bool OneapiDevice::check_peer_access(Device * /*peer_device*/)
return false;
}
BVHLayoutMask OneapiDevice::get_bvh_layout_mask() const
bool OneapiDevice::can_use_hardware_raytracing_for_features(uint requested_features) const
{
return BVH_LAYOUT_BVH2;
/* MNEE and Raytrace kernels currently don't work correctly with HWRT. */
return !(requested_features & (KERNEL_FEATURE_MNEE | KERNEL_FEATURE_NODE_RAYTRACE));
}
BVHLayoutMask OneapiDevice::get_bvh_layout_mask(uint requested_features) const
{
return (use_hardware_raytracing &&
can_use_hardware_raytracing_for_features(requested_features)) ?
BVH_LAYOUT_EMBREE :
BVH_LAYOUT_BVH2;
}
# ifdef WITH_EMBREE_GPU
void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
{
if (embree_device && bvh->params.bvh_layout == BVH_LAYOUT_EMBREE) {
BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
if (refit) {
bvh_embree->refit(progress);
}
xavierh marked this conversation as resolved

This should no longer be needed if KERNEL_FEATURE_HAIR is used.

This should no longer be needed if `KERNEL_FEATURE_HAIR` is used.
else {
bvh_embree->build(progress, &stats, embree_device);
}
if (bvh->params.top_level) {
embree_scene = bvh_embree->scene;
}
}
else {
Device::build_bvh(bvh, progress, refit);
}
}
# endif
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
kernel_features = requested_features;
bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
if (is_finished_ok == false) {
set_error("oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
@ -100,7 +172,14 @@ bool OneapiDevice::load_kernels(const uint requested_features)
assert(device_queue_);

Hardware ray tracing disabled, not supported yet by oneAPI for requested features

> Hardware ray tracing disabled, not supported yet by oneAPI for requested features
}
is_finished_ok = oneapi_load_kernels(device_queue_, (const unsigned int)requested_features);
if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
VLOG_INFO
<< "Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
use_hardware_raytracing = false;
}
is_finished_ok = oneapi_load_kernels(
device_queue_, (const unsigned int)requested_features, use_hardware_raytracing);
if (is_finished_ok == false) {
set_error("oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ + "\"");
}
@ -327,6 +406,16 @@ void OneapiDevice::const_copy_to(const char *name, void *host, size_t size)
<< string_human_readable_number(size) << " bytes. ("
<< string_human_readable_size(size) << ")";
# ifdef WITH_EMBREE_GPU
if (strcmp(name, "data") == 0) {
assert(size <= sizeof(KernelData));
/* Update scene handle(since it is different for each device on multi devices) */
KernelData *const data = (KernelData *)host;
data->device_bvh = embree_scene;
}
# endif
ConstMemMap::iterator i = const_mem_map_.find(name);
device_vector<uchar> *data;
@ -446,7 +535,9 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
# endif
}

Can you move this to the top of the file, outside the namespace also? So the function declarations are not mixed with other code.

Can you move this to the top of the file, outside the namespace also? So the function declarations are not mixed with other code.
bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
bool OneapiDevice::create_queue(SyclQueue *&external_queue,
int device_index,
void *embree_device_pointer)
{
bool finished_correct = true;
try {
@ -457,6 +548,11 @@ bool OneapiDevice::create_queue(SyclQueue *&external_queue, int device_index)
sycl::queue *created_queue = new sycl::queue(devices[device_index],
sycl::property::queue::in_order());
external_queue = reinterpret_cast<SyclQueue *>(created_queue);
# ifdef WITH_EMBREE_GPU
if (embree_device_pointer) {
*((RTCDevice *)embree_device_pointer) = rtcNewSYCLDevice(created_queue->get_context(), "");
}
# endif
}
catch (sycl::exception const &e) {
finished_correct = false;
@ -625,7 +721,8 @@ bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
size_t global_size,
void **args)
{
return oneapi_enqueue_kernel(kernel_context, kernel, global_size, args);
return oneapi_enqueue_kernel(
kernel_context, kernel, global_size, kernel_features, use_hardware_raytracing, args);
}
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
@ -830,12 +927,17 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
std::string name = device.get_info<sycl::info::device::name>();
# else
std::string name = "SYCL Host Task (Debug)";
# endif
# ifdef WITH_EMBREE_GPU
bool hwrt_support = rtcIsSYCLDeviceSupported(device);
# else
bool hwrt_support = false;
# endif
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
}
(cb)(id.c_str(), name.c_str(), num, user_ptr);
(cb)(id.c_str(), name.c_str(), num, hwrt_support, user_ptr);
num++;
}
}

View File

@ -16,15 +16,16 @@ CCL_NAMESPACE_BEGIN
class DeviceQueue;
typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
const char *name,
int num,
void *user_ptr);
typedef void (*OneAPIDeviceIteratorCallback)(
const char *id, const char *name, int num, bool hwrt_support, void *user_ptr);
class OneapiDevice : public Device {
private:
SyclQueue *device_queue_;
# if WITH_EMBREE_GPU
RTCDevice embree_device;
RTCScene embree_scene;
# endif
using ConstMemMap = map<string, device_vector<uchar> *>;
ConstMemMap const_mem_map_;
device_vector<TextureInfo> texture_info_;
@ -34,17 +35,21 @@ class OneapiDevice : public Device {
size_t kg_memory_size_ = (size_t)0;
size_t max_memory_on_device_ = (size_t)0;
std::string oneapi_error_string_;
bool use_hardware_raytracing = false;
unsigned int kernel_features = 0;
public:
virtual BVHLayoutMask get_bvh_layout_mask() const override;
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
virtual ~OneapiDevice();
# if WITH_EMBREE_GPU
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
# endif
bool check_peer_access(Device *peer_device) override;
bool load_kernels(const uint requested_features) override;
bool load_kernels(const uint kernel_features) override;
void load_texture_info();
@ -113,8 +118,9 @@ class OneapiDevice : public Device {
SyclQueue *sycl_queue();
protected:
bool can_use_hardware_raytracing_for_features(uint kernel_features) const;
void check_usm(SyclQueue *queue, const void *usm_ptr, bool allow_host);
bool create_queue(SyclQueue *&external_queue, int device_index);
bool create_queue(SyclQueue *&external_queue, int device_index, void *embree_device);
void free_queue(SyclQueue *queue);
void *usm_aligned_alloc_host(SyclQueue *queue, size_t memory_size, size_t alignment);
void *usm_alloc_device(SyclQueue *queue, size_t memory_size);

View File

@ -151,7 +151,7 @@ unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
return make_unique<OptiXDeviceQueue>(this);
}
BVHLayoutMask OptiXDevice::get_bvh_layout_mask() const
BVHLayoutMask OptiXDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
/* OptiX has its own internal acceleration structure format. */
return BVH_LAYOUT_OPTIX;

View File

@ -88,7 +88,7 @@ class OptiXDevice : public CUDADevice {
OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
~OptiXDevice();
BVHLayoutMask get_bvh_layout_mask() const override;
BVHLayoutMask get_bvh_layout_mask(uint /*kernel_features*/) const override;
string compile_kernel_get_common_cflags(const uint kernel_features);

View File

@ -299,8 +299,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
* become busy after adding new tiles). This is especially important for the shadow catcher which
* schedules work in halves of available number of paths. */
work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
0);
work_tile_scheduler_.set_accelerated_rt(
(device_->get_bvh_layout_mask(device_scene_->data.kernel_features) & BVH_LAYOUT_OPTIX) != 0);
work_tile_scheduler_.reset(effective_buffer_params_,
start_sample,
samples_num,

View File

@ -96,10 +96,13 @@ set(SRC_KERNEL_DEVICE_ONEAPI_HEADERS
device/oneapi/compat.h
device/oneapi/context_begin.h
device/oneapi/context_end.h
device/oneapi/context_intersect_begin.h
device/oneapi/context_intersect_end.h
device/oneapi/globals.h
device/oneapi/image.h
device/oneapi/kernel.h
device/oneapi/kernel_templates.h
device/cpu/bvh.h
)
set(SRC_KERNEL_CLOSURE_HEADERS
@ -764,7 +767,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
# Set defaults for spir64 and spir64_gen options
if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-regular-grf-kernel integrator_intersect -ze-opt-large-grf-kernel shade -ze-opt-no-local-to-generic'")
endif()
if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen)
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "${CYCLES_ONEAPI_SYCL_OPTIONS_spir64}" CACHE STRING "Extra build options for spir64_gen target")
@ -776,7 +779,7 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
# Host execution won't use GPU binaries, no need to compile them.
if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
# AoT binaries aren't currently reused when calling sycl::build.
list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
list(APPEND sycl_compiler_flags -DWITH_CYCLES_ONEAPI_BINARIES)
# Iterate over all targest and their options
list(JOIN CYCLES_ONEAPI_SYCL_TARGETS "," targets_string)
list(APPEND sycl_compiler_flags -fsycl-targets=${targets_string})
@ -798,6 +801,59 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
-I"${NANOVDB_INCLUDE_DIR}")
endif()
if(WITH_CYCLES_EMBREE AND EMBREE_SYCL_SUPPORT)
list(APPEND sycl_compiler_flags
-DWITH_EMBREE
-DWITH_EMBREE_GPU
-DEMBREE_MAJOR_VERSION=${EMBREE_MAJOR_VERSION}
-I"${EMBREE_INCLUDE_DIRS}")
if(WIN32)
list(APPEND sycl_compiler_flags
-ladvapi32.lib
)
endif()
set(next_library_mode "")
foreach(library ${EMBREE_LIBRARIES})
string(TOLOWER "${library}" library_lower)
if(("${library_lower}" STREQUAL "optimized") OR
("${library_lower}" STREQUAL "debug"))
set(next_library_mode "${library_lower}")
else()
if(next_library_mode STREQUAL "")
list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library})
list(APPEND EMBREE_TBB_LIBRARIES_debug ${library})
else()
list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library})
endif()
set(next_library_mode "")
endif()
endforeach()
foreach(library ${TBB_LIBRARIES})
string(TOLOWER "${library}" library_lower)
if(("${library_lower}" STREQUAL "optimized") OR
("${library_lower}" STREQUAL "debug"))
set(next_library_mode "${library_lower}")
else()
if(next_library_mode STREQUAL "")
list(APPEND EMBREE_TBB_LIBRARIES_optimized ${library})
list(APPEND EMBREE_TBB_LIBRARIES_debug ${library})
else()
list(APPEND EMBREE_TBB_LIBRARIES_${next_library_mode} ${library})
endif()
set(next_library_mode "")
endif()
endforeach()
list(APPEND sycl_compiler_flags
"$<$<CONFIG:Release>:${EMBREE_TBB_LIBRARIES_optimized}>"
"$<$<CONFIG:RelWithDebInfo>:${EMBREE_TBB_LIBRARIES_optimized}>"
"$<$<CONFIG:MinSizeRel>:${EMBREE_TBB_LIBRARIES_optimized}>"
"$<$<CONFIG:Debug>:${EMBREE_TBB_LIBRARIES_debug}>"
)
endif()
if(WITH_CYCLES_DEBUG)
list(APPEND sycl_compiler_flags -DWITH_CYCLES_DEBUG)
endif()

View File

@ -21,6 +21,28 @@
# define __BVH2__
#endif
#if defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU)
/* bool is apparently not tested for specialization constants:
* https://github.com/intel/llvm/blob/39d1c65272a786b2b13a6f094facfddf9408406d/sycl/test/basic_tests/SYCL-2020-spec-constants.cpp#L25-L27
* Instead of adding one more bool specialization constant, we reuse existing embree_features one
* and use RTC_FEATURE_FLAG_NONE as value to test for avoiding to call Embree on GPU.
*/
/* We set it to RTC_FEATURE_FLAG_NONE by default so AoT binaries contain MNE and raytrace kernels
* precompiled without Embree.
* Changing this default value would require updating the logic in oneapi_load_kernels(). */
static constexpr sycl::specialization_id<RTCFeatureFlags> oneapi_embree_features{
RTC_FEATURE_FLAG_NONE};
# define IF_USING_EMBREE \
if (kernel_handler.get_specialization_constant<oneapi_embree_features>() != \
RTC_FEATURE_FLAG_NONE)
# define IF_NOT_USING_EMBREE \
if (kernel_handler.get_specialization_constant<oneapi_embree_features>() == \
xavierh marked this conversation as resolved

Maybe use if constexpr here to be really sure?

Maybe use `if constexpr` here to be really sure?
Review

the host compiler at least cannot consider it constexpr.

the host compiler at least cannot consider it constexpr.
RTC_FEATURE_FLAG_NONE)
#else
# define IF_USING_EMBREE
# define IF_NOT_USING_EMBREE
#endif
CCL_NAMESPACE_BEGIN
xavierh marked this conversation as resolved Outdated

We should not define __BVH2__ for OptiX and MetalRT (currently seems it is defined for all types of devices).

We should not define `__BVH2__` for OptiX and MetalRT (currently seems it is defined for all types of devices).

@xavierh, I as I see there is a BVH2 definion above:

#if defined(__EMBREE__)
#  include "kernel/device/cpu/bvh.h"
#  define __BVH2__
#elif defined(__METALRT__)
#  include "kernel/device/metal/bvh.h"
#elif defined(__KERNEL_OPTIX__)
#  include "kernel/device/optix/bvh.h"
#else
#  define __BVH2__
#endif

So do we really need __BVH2__ redefined again?

@xavierh, I as I see there is a __BVH2__ definion above: ```` #if defined(__EMBREE__) # include "kernel/device/cpu/bvh.h" # define __BVH2__ #elif defined(__METALRT__) # include "kernel/device/metal/bvh.h" #elif defined(__KERNEL_OPTIX__) # include "kernel/device/optix/bvh.h" #else # define __BVH2__ #endif ```` So do we really need `__BVH2__` redefined again?

good catch - that redefinition was definitely a mistake and it had no purpose, I've removed it.

good catch - that redefinition was definitely a mistake and it had no purpose, I've removed it.
#ifdef __BVH2__
@ -74,30 +96,39 @@ ccl_device_intersect bool scene_intersect(KernelGlobals kg,
}
# ifdef __EMBREE__
if (kernel_data.device_bvh) {
return kernel_embree_intersect(kg, ray, visibility, isect);
IF_USING_EMBREE
{
if (kernel_data.device_bvh) {
return kernel_embree_intersect(kg, ray, visibility, isect);
}
}
# endif
IF_NOT_USING_EMBREE
{
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
if (kernel_data.bvh.have_motion) {
# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_hair_motion(kg, ray, isect, visibility);
}
if (kernel_data.bvh.have_curves) {
return bvh_intersect_hair_motion(kg, ray, isect, visibility);
}
# endif /* __HAIR__ */
return bvh_intersect_motion(kg, ray, isect, visibility);
}
return bvh_intersect_motion(kg, ray, isect, visibility);
}
# endif /* __OBJECT_MOTION__ */
# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_hair(kg, ray, isect, visibility);
}
if (kernel_data.bvh.have_curves) {
return bvh_intersect_hair(kg, ray, isect, visibility);
}
# endif /* __HAIR__ */
return bvh_intersect(kg, ray, isect, visibility);
return bvh_intersect(kg, ray, isect, visibility);
}
kernel_assert(false);
return false;
}
/* Single object BVH traversal, for SSS/AO/bevel. */
@ -129,17 +160,27 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals kg,
}
# ifdef __EMBREE__
if (kernel_data.device_bvh) {
return kernel_embree_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
IF_USING_EMBREE
{
if (kernel_data.device_bvh) {
return kernel_embree_intersect_local(
kg, ray, local_isect, local_object, lcg_state, max_hits);
}
}
# endif
IF_NOT_USING_EMBREE
{
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
}
if (kernel_data.bvh.have_motion) {
return bvh_intersect_local_motion(kg, ray, local_isect, local_object, lcg_state, max_hits);
}
# endif /* __OBJECT_MOTION__ */
return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
return bvh_intersect_local(kg, ray, local_isect, local_object, lcg_state, max_hits);
}
kernel_assert(false);
return false;
}
# endif
@ -184,35 +225,44 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals kg,
}
# ifdef __EMBREE__
if (kernel_data.device_bvh) {
return kernel_embree_intersect_shadow_all(
kg, state, ray, visibility, max_hits, num_recorded_hits, throughput);
IF_USING_EMBREE
{
if (kernel_data.device_bvh) {
return kernel_embree_intersect_shadow_all(
kg, state, ray, visibility, max_hits, num_recorded_hits, throughput);
}
}
# endif
IF_NOT_USING_EMBREE
{
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
if (kernel_data.bvh.have_motion) {
# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_shadow_all_hair_motion(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
if (kernel_data.bvh.have_curves) {
return bvh_intersect_shadow_all_hair_motion(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
# endif /* __HAIR__ */
return bvh_intersect_shadow_all_motion(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
return bvh_intersect_shadow_all_motion(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
# endif /* __OBJECT_MOTION__ */
# ifdef __HAIR__
if (kernel_data.bvh.have_curves) {
return bvh_intersect_shadow_all_hair(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
if (kernel_data.bvh.have_curves) {
return bvh_intersect_shadow_all_hair(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
# endif /* __HAIR__ */
return bvh_intersect_shadow_all(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
return bvh_intersect_shadow_all(
kg, ray, state, visibility, max_hits, num_recorded_hits, throughput);
}
kernel_assert(false);
return false;
}
# endif /* __SHADOW_RECORD_ALL__ */
@ -239,13 +289,19 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals kg,
return false;
}
IF_NOT_USING_EMBREE
{
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
return bvh_intersect_volume_motion(kg, ray, isect, visibility);
}
if (kernel_data.bvh.have_motion) {
return bvh_intersect_volume_motion(kg, ray, isect, visibility);
}
# endif /* __OBJECT_MOTION__ */
return bvh_intersect_volume(kg, ray, isect, visibility);
return bvh_intersect_volume(kg, ray, isect, visibility);
}
kernel_assert(false);
return false;
}
# endif /* defined(__VOLUME__) && !defined(__VOLUME_RECORD_ALL__) */
@ -275,18 +331,27 @@ ccl_device_intersect uint scene_intersect_volume(KernelGlobals kg,
}
# ifdef __EMBREE__
if (kernel_data.device_bvh) {
return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility);
IF_USING_EMBREE
{
if (kernel_data.device_bvh) {
return kernel_embree_intersect_volume(kg, ray, isect, max_hits, visibility);
}
}
# endif
IF_NOT_USING_EMBREE
{
# ifdef __OBJECT_MOTION__
if (kernel_data.bvh.have_motion) {
return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
}
if (kernel_data.bvh.have_motion) {
return bvh_intersect_volume_all_motion(kg, ray, isect, max_hits, visibility);
}
# endif /* __OBJECT_MOTION__ */
return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
return bvh_intersect_volume_all(kg, ray, isect, max_hits, visibility);
}
kernel_assert(false);
return false;
}
# endif /* defined(__VOLUME__) && defined(__VOLUME_RECORD_ALL__) */

View File

@ -13,8 +13,13 @@
# include <embree3/rtcore_scene.h>
#endif
#include "kernel/device/cpu/compat.h"
#include "kernel/device/cpu/globals.h"
#ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/compat.h"
# include "kernel/device/oneapi/globals.h"
#else
# include "kernel/device/cpu/compat.h"
# include "kernel/device/cpu/globals.h"
#endif
#include "kernel/bvh/types.h"
#include "kernel/bvh/util.h"
@ -33,11 +38,16 @@ using numhit_t = uint8_t;
using numhit_t = uint32_t;
#endif
#define CYCLES_EMBREE_USED_FEATURES \
(RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \
RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \
RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \
RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE)
#ifdef __KERNEL_ONEAPI__
# define CYCLES_EMBREE_USED_FEATURES \
(kernel_handler.get_specialization_constant<oneapi_embree_features>())
#else
# define CYCLES_EMBREE_USED_FEATURES \
(RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE | \
RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS | RTC_FEATURE_FLAG_POINT | \
RTC_FEATURE_FLAG_MOTION_BLUR | RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE | \
RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE)
#endif
#define EMBREE_IS_HAIR(x) (x & 1)
@ -252,7 +262,8 @@ ccl_device_inline void kernel_embree_convert_sss_hit(KernelGlobals kg,
* Things like recording subsurface or shadow hits for later evaluation
* as well as filtering for volume objects happen here.
* Cycles' own BVH does that directly inside the traversal calls. */
ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNArguments *args)
ccl_device_forceinline void kernel_embree_filter_intersection_func_impl(
const RTCFilterFunctionNArguments *args)
{
/* Current implementation in Cycles assumes only single-ray intersection queries. */
assert(args->N == 1);
@ -263,7 +274,11 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA
#else
CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
#endif
#ifdef __KERNEL_ONEAPI__
KernelGlobalsGPU *kg = nullptr;
#else
const KernelGlobalsCPU *kg = ctx->kg;
#endif
const Ray *cray = ctx->ray;
if (kernel_embree_is_self_intersection(
@ -277,7 +292,7 @@ ccl_device void kernel_embree_filter_intersection_func(const RTCFilterFunctionNA
* as well as filtering for volume objects happen here.
* Cycles' own BVH does that directly inside the traversal calls.
*/
ccl_device void kernel_embree_filter_occluded_shadow_all_func(
ccl_device_forceinline void kernel_embree_filter_occluded_shadow_all_func_impl(
const RTCFilterFunctionNArguments *args)
{
/* Current implementation in Cycles assumes only single-ray intersection queries. */
@ -290,7 +305,11 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
#else
CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
#endif
#ifdef __KERNEL_ONEAPI__
KernelGlobalsGPU *kg = nullptr;
#else
const KernelGlobalsCPU *kg = ctx->kg;
#endif
const Ray *cray = ctx->ray;
Intersection current_isect;
@ -326,7 +345,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
}
/* Test if we need to record this transparent intersection. */
const numhit_t max_record_hits = min(ctx->max_hits, INTEGRATOR_SHADOW_ISECT_SIZE);
const numhit_t max_record_hits = min(ctx->max_hits, numhit_t(INTEGRATOR_SHADOW_ISECT_SIZE));
if (ctx->num_recorded_hits < max_record_hits) {
/* If maximum number of hits was reached, replace the intersection with the
* highest distance. We want to find the N closest intersections. */
@ -363,7 +382,7 @@ ccl_device void kernel_embree_filter_occluded_shadow_all_func(
*args->valid = 0;
}
ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
ccl_device_forceinline void kernel_embree_filter_occluded_local_func_impl(
const RTCFilterFunctionNArguments *args)
{
/* Current implementation in Cycles assumes only single-ray intersection queries. */
@ -376,7 +395,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
#else
CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
#endif
#ifdef __KERNEL_ONEAPI__
KernelGlobalsGPU *kg = nullptr;
#else
const KernelGlobalsCPU *kg = ctx->kg;
#endif
const Ray *cray = ctx->ray;
/* Check if it's hitting the correct object. */
@ -462,7 +485,7 @@ ccl_device_forceinline void kernel_embree_filter_occluded_local_func(
*args->valid = 0;
}
ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func_impl(
const RTCFilterFunctionNArguments *args)
{
/* Current implementation in Cycles assumes only single-ray intersection queries. */
@ -475,7 +498,11 @@ ccl_device_forceinline void kernel_embree_filter_occluded_volume_all_func(
#else
CCLIntersectContext *ctx = (CCLIntersectContext *)(args->context);
#endif
#ifdef __KERNEL_ONEAPI__
KernelGlobalsGPU *kg = nullptr;
#else
const KernelGlobalsCPU *kg = ctx->kg;
#endif
const Ray *cray = ctx->ray;
/* Append the intersection to the end of the array. */
@ -513,14 +540,14 @@ ccl_device_forceinline void kernel_embree_filter_occluded_func(
switch (ctx->type) {
case CCLIntersectContext::RAY_SHADOW_ALL:
kernel_embree_filter_occluded_shadow_all_func(args);
kernel_embree_filter_occluded_shadow_all_func_impl(args);
break;
case CCLIntersectContext::RAY_LOCAL:
case CCLIntersectContext::RAY_SSS:
kernel_embree_filter_occluded_local_func(args);
kernel_embree_filter_occluded_local_func_impl(args);
break;
case CCLIntersectContext::RAY_VOLUME_ALL:
kernel_embree_filter_occluded_volume_all_func(args);
kernel_embree_filter_occluded_volume_all_func_impl(args);
break;
case CCLIntersectContext::RAY_REGULAR:
@ -569,7 +596,63 @@ ccl_device void kernel_embree_filter_occluded_func_backface_cull(
kernel_embree_filter_occluded_func(args);
}
#endif
#ifdef __KERNEL_ONEAPI__
/* Static wrappers so we can call the callbacks from out side the ONEAPIKernelContext class */
RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
kernel_embree_filter_intersection_func_static(const RTCFilterFunctionNArguments *args)
{
RTCHit *hit = (RTCHit *)args->hit;
CCLFirstHitContext *ctx = (CCLFirstHitContext *)(args->context);
ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
context->kernel_embree_filter_intersection_func_impl(args);
}
RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
kernel_embree_filter_occluded_shadow_all_func_static(const RTCFilterFunctionNArguments *args)
{
RTCHit *hit = (RTCHit *)args->hit;
CCLShadowContext *ctx = (CCLShadowContext *)(args->context);
ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
context->kernel_embree_filter_occluded_shadow_all_func_impl(args);
}
RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
kernel_embree_filter_occluded_local_func_static(const RTCFilterFunctionNArguments *args)
{
RTCHit *hit = (RTCHit *)args->hit;
CCLLocalContext *ctx = (CCLLocalContext *)(args->context);
ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
context->kernel_embree_filter_occluded_local_func_impl(args);
}
RTC_SYCL_INDIRECTLY_CALLABLE static void ccl_always_inline
kernel_embree_filter_occluded_volume_all_func_static(const RTCFilterFunctionNArguments *args)
{
RTCHit *hit = (RTCHit *)args->hit;
CCLVolumeContext *ctx = (CCLVolumeContext *)(args->context);
ONEAPIKernelContext *context = static_cast<ONEAPIKernelContext *>(ctx->kg);
context->kernel_embree_filter_occluded_volume_all_func_impl(args);
}
# define kernel_embree_filter_intersection_func \
ONEAPIKernelContext::kernel_embree_filter_intersection_func_static
# define kernel_embree_filter_occluded_shadow_all_func \
ONEAPIKernelContext::kernel_embree_filter_occluded_shadow_all_func_static
# define kernel_embree_filter_occluded_local_func \
ONEAPIKernelContext::kernel_embree_filter_occluded_local_func_static
# define kernel_embree_filter_occluded_volume_all_func \
ONEAPIKernelContext::kernel_embree_filter_occluded_volume_all_func_static
#else
# define kernel_embree_filter_intersection_func kernel_embree_filter_intersection_func_impl
# if EMBREE_MAJOR_VERSION >= 4
# define kernel_embree_filter_occluded_shadow_all_func \
kernel_embree_filter_occluded_shadow_all_func_impl
# define kernel_embree_filter_occluded_local_func kernel_embree_filter_occluded_local_func_impl
# define kernel_embree_filter_occluded_volume_all_func \
kernel_embree_filter_occluded_volume_all_func_impl
# endif
#endif
/* Scene intersection. */
@ -583,7 +666,15 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
#if EMBREE_MAJOR_VERSION >= 4
CCLFirstHitContext ctx;
rtcInitRayQueryContext(&ctx);
# ifdef __KERNEL_ONEAPI__
/* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
* uses global device allocation (CUDA, Optix, HIP) or passes all needed data
* as a class context (Metal, oneAPI). So we need to pass this context here
* in order to have an access to it later in Embree filter functions on GPU. */
ctx.kg = (KernelGlobals)this;
# else
ctx.kg = kg;
# endif
#else
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_REGULAR);
rtcInitIntersectContext(&ctx);
@ -596,7 +687,7 @@ ccl_device_intersect bool kernel_embree_intersect(KernelGlobals kg,
#if EMBREE_MAJOR_VERSION >= 4
RTCIntersectArguments args;
rtcInitIntersectArguments(&args);
args.filter = (RTCFilterFunctionN)kernel_embree_filter_intersection_func;
args.filter = reinterpret_cast<RTCFilterFunctionN>(kernel_embree_filter_intersection_func);
args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
args.context = &ctx;
rtcIntersect1(kernel_data.device_bvh, &ray_hit, &args);
@ -625,7 +716,15 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
# if EMBREE_MAJOR_VERSION >= 4
CCLLocalContext ctx;
rtcInitRayQueryContext(&ctx);
# ifdef __KERNEL_ONEAPI__
/* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
* uses global device allocation (CUDA, Optix, HIP) or passes all needed data
* as a class context (Metal, oneAPI). So we need to pass this context here
* in order to have an access to it later in Embree filter functions on GPU. */
ctx.kg = (KernelGlobals)this;
# else
ctx.kg = kg;
# endif
# else
CCLIntersectContext ctx(kg,
has_bvh ? CCLIntersectContext::RAY_SSS : CCLIntersectContext::RAY_LOCAL);
@ -646,7 +745,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
# if EMBREE_MAJOR_VERSION >= 4
RTCOccludedArguments args;
rtcInitOccludedArguments(&args);
args.filter = (RTCFilterFunctionN)(kernel_embree_filter_occluded_local_func);
args.filter = reinterpret_cast<RTCFilterFunctionN>(kernel_embree_filter_occluded_local_func);
args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
args.context = &ctx;
# endif
@ -692,7 +791,7 @@ ccl_device_intersect bool kernel_embree_intersect_local(KernelGlobals kg,
#ifdef __SHADOW_RECORD_ALL__
ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
IntegratorShadowStateCPU *state,
IntegratorShadowState state,
ccl_private const Ray *ray,
uint visibility,
uint max_hits,
@ -702,7 +801,15 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
# if EMBREE_MAJOR_VERSION >= 4
CCLShadowContext ctx;
rtcInitRayQueryContext(&ctx);
# ifdef __KERNEL_ONEAPI__
/* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
* uses global device allocation (CUDA, Optix, HIP) or passes all needed data
* as a class context (Metal, oneAPI). So we need to pass this context here
* in order to have an access to it later in Embree filter functions on GPU. */
ctx.kg = (KernelGlobals)this;
# else
ctx.kg = kg;
# endif
# else
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_SHADOW_ALL);
rtcInitIntersectContext(&ctx);
@ -718,7 +825,8 @@ ccl_device_intersect bool kernel_embree_intersect_shadow_all(KernelGlobals kg,
# if EMBREE_MAJOR_VERSION >= 4
RTCOccludedArguments args;
rtcInitOccludedArguments(&args);
args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_shadow_all_func;
args.filter = reinterpret_cast<RTCFilterFunctionN>(
kernel_embree_filter_occluded_shadow_all_func);
args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
args.context = &ctx;
rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args);
@ -742,7 +850,15 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
# if EMBREE_MAJOR_VERSION >= 4
CCLVolumeContext ctx;
rtcInitRayQueryContext(&ctx);
# ifdef __KERNEL_ONEAPI__
/* NOTE(sirgienko) Cycles GPU backends passes NULL to KernelGlobals and
* uses global device allocation (CUDA, Optix, HIP) or passes all needed data
* as a class context (Metal, oneAPI). So we need to pass this context here
* in order to have an access to it later in Embree filter functions on GPU. */
ctx.kg = (KernelGlobals)this;
# else
ctx.kg = kg;
# endif
# else
CCLIntersectContext ctx(kg, CCLIntersectContext::RAY_VOLUME_ALL);
rtcInitIntersectContext(&ctx);
@ -756,7 +872,8 @@ ccl_device_intersect uint kernel_embree_intersect_volume(KernelGlobals kg,
# if EMBREE_MAJOR_VERSION >= 4
RTCOccludedArguments args;
rtcInitOccludedArguments(&args);
args.filter = (RTCFilterFunctionN)kernel_embree_filter_occluded_volume_all_func;
args.filter = reinterpret_cast<RTCFilterFunctionN>(
kernel_embree_filter_occluded_volume_all_func);
args.feature_mask = CYCLES_EMBREE_USED_FEATURES;
args.context = &ctx;
rtcOccluded1(kernel_data.device_bvh, &rtc_ray, &args);

View File

@ -128,6 +128,12 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel_postfix
/* Intersection kernels need access to the kernel handler for specialization constants to work
* properly. */
#ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_begin.h"
#endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_intersect_closest,
ccl_global const int *path_index_array,
@ -185,6 +191,10 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
ccl_gpu_kernel_postfix
#ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_end.h"
#endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_shade_background,
ccl_global const int *path_index_array,
@ -249,6 +259,12 @@ ccl_gpu_kernel_postfix
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif
/* Kernels using intersections need access to the kernel handler for specialization constants to
* work properly. */
#ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_begin.h"
#endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_shade_surface_raytrace,
ccl_global const int *path_index_array,
@ -287,6 +303,9 @@ ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
}
}
ccl_gpu_kernel_postfix
#ifdef __KERNEL_ONEAPI__
# include "kernel/device/oneapi/context_intersect_end.h"
#endif
ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS)
ccl_gpu_kernel_signature(integrator_shade_volume,

View File

@ -5,6 +5,11 @@
#define __KERNEL_GPU__
#define __KERNEL_ONEAPI__
#define __KERNEL_64_BIT__
#ifdef WITH_EMBREE_GPU
# define __KERNEL_GPU_RAYTRACING__
#endif
#define CCL_NAMESPACE_BEGIN

What is this for, it doesn't seem to be used by our own code? If it's needed, can you add a comment explaining what this is?

What is this for, it doesn't seem to be used by our own code? If it's needed, can you add a comment explaining what this is?

it was at some point in Embree but it's gone, I'll remove it.

it was at some point in Embree but it's gone, I'll remove it.
#define CCL_NAMESPACE_END
@ -57,17 +62,19 @@
#define ccl_gpu_kernel_threads(block_num_threads)
#ifndef WITH_ONEAPI_SYCL_HOST_TASK
# define ccl_gpu_kernel_signature(name, ...) \
# define __ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
size_t kernel_local_size, \
sycl::handler &cgh, \
__VA_ARGS__) { \
(kg); \
cgh.parallel_for<class kernel_##name>( \
cgh.parallel_for( \
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
[=](sycl::nd_item<1> item) {
# define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
# define ccl_gpu_kernel_postfix \
}); \
}

View File

@ -0,0 +1,18 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2023 Intel Corporation */
#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU)
# undef ccl_gpu_kernel_signature
# define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
size_t kernel_local_size, \
sycl::handler &cgh, \
__VA_ARGS__) \
{ \
(kg); \
cgh.parallel_for( \
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
[=](sycl::nd_item<1> item, sycl::kernel_handler oneapi_kernel_handler) { \
((ONEAPIKernelContext*)kg)->kernel_handler = oneapi_kernel_handler;
#endif

View File

@ -0,0 +1,7 @@
/* SPDX-License-Identifier: Apache-2.0
* Copyright 2023 Intel Corporation */
#if !defined(WITH_ONEAPI_SYCL_HOST_TASK) && defined(WITH_EMBREE_GPU)
# undef ccl_gpu_kernel_signature
# define ccl_gpu_kernel_signature __ccl_gpu_kernel_signature
#endif

View File

@ -31,6 +31,8 @@ typedef struct KernelGlobalsGPU {
size_t nd_item_group_range_0;
size_t nd_item_global_id_0;
size_t nd_item_global_range_0;
#else
sycl::kernel_handler kernel_handler;
#endif
} KernelGlobalsGPU;

View File

@ -16,9 +16,22 @@
# include "kernel/device/gpu/kernel.h"
# include "device/kernel.cpp"
static OneAPIErrorCallback s_error_cb = nullptr;
static void *s_error_user_ptr = nullptr;
# ifdef WITH_EMBREE_GPU
static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_BASIC_FEATURES =
(const RTCFeatureFlags)(RTC_FEATURE_FLAG_TRIANGLE | RTC_FEATURE_FLAG_INSTANCE |
RTC_FEATURE_FLAG_FILTER_FUNCTION_IN_ARGUMENTS |
RTC_FEATURE_FLAG_POINT | RTC_FEATURE_FLAG_MOTION_BLUR);
static const RTCFeatureFlags CYCLES_ONEAPI_EMBREE_ALL_FEATURES =
(const RTCFeatureFlags)(CYCLES_ONEAPI_EMBREE_BASIC_FEATURES |
RTC_FEATURE_FLAG_ROUND_CATMULL_ROM_CURVE |
RTC_FEATURE_FLAG_FLAT_CATMULL_ROM_CURVE);
# endif
void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr)
{
s_error_cb = cb;
@ -142,15 +155,96 @@ size_t oneapi_kernel_preferred_local_size(SyclQueue *queue,
return std::min(limit_work_group_size, preferred_work_group_size);
}
bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
const uint kernel_features)
{
if ((kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0 &&
kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE)) !=
std::string::npos)
return false;
if ((kernel_features & KERNEL_FEATURE_MNEE) == 0 &&
kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE)) !=
std::string::npos)

Can you use kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE)) and similar instead?

Can you use `kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE))` and similar instead?
return false;
if ((kernel_features & KERNEL_FEATURE_VOLUME) == 0 &&
kernel_name.find(device_kernel_as_string(DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK)) !=
std::string::npos)
return false;
return true;
}
bool oneapi_kernel_is_using_embree(const std::string &kernel_name)
{
# ifdef WITH_EMBREE_GPU
/* MNEE and Raytrace kernels aren't yet enabled to use Embree. */
for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {

Can make this more generic:

for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
  DeviceKernel kernel = (DeviceKernel)i;
  if (device_kernel_has_intersection(kernel)) {
    if (kernel_name.find(device_kernel_as_string(kernel)) != std::string::npos) {
      return !(kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);
    }
  }
}
Can make this more generic: ``` for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) { DeviceKernel kernel = (DeviceKernel)i; if (device_kernel_has_intersection(kernel)) { if (kernel_name.find(device_kernel_as_string(kernel)) != std::string::npos) { return !(kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE || DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE); } } } ```
DeviceKernel kernel = (DeviceKernel)i;
if (device_kernel_has_intersection(kernel)) {
if (kernel_name.find(device_kernel_as_string(kernel)) != std::string::npos) {
return !(kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE ||
kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE);

Replace by device_kernel_has_intersection which does the same thing.

Replace by `device_kernel_has_intersection` which does the same thing.
}
}
}
# endif
return false;
}
bool oneapi_load_kernels(SyclQueue *queue_,
const uint kernel_features,
bool use_hardware_raytracing)
{
# ifdef SYCL_SKIP_KERNELS_PRELOAD
(void)queue_;
(void)requested_features;
# else
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
# ifdef WITH_EMBREE_GPU
/* For best performance, we always JIT compile the kernels that are using Embree. */
if (use_hardware_raytracing) {
try {
sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
{queue->get_device()});
for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
const std::string &kernel_name = kernel_id.get_name();
if (!oneapi_kernel_is_required_for_features(kernel_name, kernel_features) ||
!oneapi_kernel_is_using_embree(kernel_name)) {
continue;
}
sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle_input =
sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
/* Hair requires embree curves support. */
if (kernel_features & KERNEL_FEATURE_HAIR) {
one_kernel_bundle_input
.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
CYCLES_ONEAPI_EMBREE_ALL_FEATURES);
sycl::build(one_kernel_bundle_input);
}
else {
one_kernel_bundle_input
.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
CYCLES_ONEAPI_EMBREE_BASIC_FEATURES);
sycl::build(one_kernel_bundle_input);
}
}
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
}
# endif
# ifdef WITH_CYCLES_ONEAPI_BINARIES
(void)queue_;
(void)kernel_features;
# else
try {
sycl::kernel_bundle<sycl::bundle_state::input> all_kernels_bundle =
sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(),
@ -159,27 +253,25 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
for (const sycl::kernel_id &kernel_id : all_kernels_bundle.get_kernel_ids()) {
const std::string &kernel_name = kernel_id.get_name();
/* NOTE(@nsirgien): Names in this conditions below should match names from
* oneapi_call macro in oneapi_enqueue_kernel below */
if (((requested_features & KERNEL_FEATURE_VOLUME) == 0) &&
kernel_name.find("oneapi_kernel_integrator_shade_volume") != std::string::npos) {
/* In case HWRT is on, compilation of kernels using Embree is already handled in previous
* block. */
if (!oneapi_kernel_is_required_for_features(kernel_name, kernel_features) ||
(use_hardware_raytracing && oneapi_kernel_is_using_embree(kernel_name))) {
continue;
}
if (((requested_features & KERNEL_FEATURE_MNEE) == 0) &&
kernel_name.find("oneapi_kernel_integrator_shade_surface_mnee") != std::string::npos) {
continue;
}
if (((requested_features & KERNEL_FEATURE_NODE_RAYTRACE) == 0) &&
kernel_name.find("oneapi_kernel_integrator_shade_surface_raytrace") !=
std::string::npos) {
continue;
}
sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle =
sycl::kernel_bundle<sycl::bundle_state::input> one_kernel_bundle_input =
sycl::get_kernel_bundle<sycl::bundle_state::input>(queue->get_context(), {kernel_id});
sycl::build(one_kernel_bundle);
# ifdef WITH_EMBREE_GPU
/* This is expected to be the default, we set it again to be sure. */
if (one_kernel_bundle_input
.has_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>()) {
one_kernel_bundle_input
.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
RTC_FEATURE_FLAG_NONE);
}
# endif
sycl::build(one_kernel_bundle_input);
}
}
catch (sycl::exception const &e) {
@ -195,6 +287,8 @@ bool oneapi_load_kernels(SyclQueue *queue_, const uint requested_features)
bool oneapi_enqueue_kernel(KernelContext *kernel_context,
int kernel,
size_t global_size,
const uint kernel_features,
bool use_hardware_raytracing,
void **args)
{
bool success = true;
@ -248,6 +342,21 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
try {
queue->submit([&](sycl::handler &cgh) {
# ifdef WITH_EMBREE_GPU
/* Spec says it has no effect if the called kernel doesn't support the below specialization
* constant but it can still trigger a recompilation, so we set it only if needed. */
if (device_kernel_has_intersection(device_kernel)) {
const RTCFeatureFlags used_embree_features = !use_hardware_raytracing ?
RTC_FEATURE_FLAG_NONE :
!(kernel_features & KERNEL_FEATURE_HAIR) ?
CYCLES_ONEAPI_EMBREE_BASIC_FEATURES :
CYCLES_ONEAPI_EMBREE_ALL_FEATURES;
cgh.set_specialization_constant<ONEAPIKernelContext::oneapi_embree_features>(
used_embree_features);
}
# else
(void)kernel_features;
# endif
switch (device_kernel) {
case DEVICE_KERNEL_INTEGRATOR_RESET: {
oneapi_call(kg, cgh, global_size, local_size, args, oneapi_kernel_integrator_reset);
@ -549,4 +658,5 @@ bool oneapi_enqueue_kernel(KernelContext *kernel_context,
# endif
return success;
}
#endif /* WITH_ONEAPI */

View File

@ -47,10 +47,14 @@ CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_kernel_preferred_local_size(
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,
int kernel,
size_t global_size,
const unsigned int kernel_features,
bool use_hardware_raytracing,
void **args);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_load_kernels(SyclQueue *queue,
const unsigned int requested_features);
const unsigned int kernel_features,
bool use_hardware_raytracing);
# ifdef __cplusplus
}
# endif
#endif /* WITH_ONEAPI */

View File

@ -3,8 +3,9 @@
#pragma once
#if !defined(__KERNEL_GPU__) && defined(WITH_EMBREE)
# if EMBREE_MAJOR_VERSION >= 4
#if (!defined(__KERNEL_GPU__) || (defined(__KERNEL_ONEAPI__) && defined(WITH_EMBREE_GPU))) && \
defined(WITH_EMBREE)
# if EMBREE_MAJOR_VERSION == 4
# include <embree4/rtcore.h>
# include <embree4/rtcore_scene.h>
# else

View File

@ -194,8 +194,8 @@ void Geometry::compute_bvh(Device *device,
compute_bounds();
const BVHLayout bvh_layout = BVHParams::best_bvh_layout(params->bvh_layout,
device->get_bvh_layout_mask());
const BVHLayout bvh_layout = BVHParams::best_bvh_layout(
params->bvh_layout, device->get_bvh_layout_mask(dscene->data.kernel_features));
if (need_build_bvh(bvh_layout)) {
string msg = "Updating Geometry BVH ";
if (name.empty())
@ -1235,8 +1235,8 @@ void GeometryManager::device_update_bvh(Device *device,
BVHParams bparams;
bparams.top_level = true;
bparams.bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
device->get_bvh_layout_mask());
bparams.bvh_layout = BVHParams::best_bvh_layout(
scene->params.bvh_layout, device->get_bvh_layout_mask(dscene->data.kernel_features));
bparams.use_spatial_split = scene->params.use_bvh_spatial_split;
bparams.use_unaligned_nodes = dscene->data.bvh.have_curves &&
scene->params.use_bvh_unaligned_nodes;
@ -1889,8 +1889,8 @@ void GeometryManager::device_update(Device *device,
/* Device update. */
device_free(device, dscene, false);
const BVHLayout bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
device->get_bvh_layout_mask());
const BVHLayout bvh_layout = BVHParams::best_bvh_layout(
scene->params.bvh_layout, device->get_bvh_layout_mask(dscene->data.kernel_features));
geom_calc_offset(scene, bvh_layout);
if (true_displacement_used || curve_shadow_transparency_used) {
scoped_callback_timer timer([scene](double time) {
@ -2051,8 +2051,8 @@ void GeometryManager::device_update(Device *device,
/* Always set BVH layout again after displacement where it was set to none,
* to avoid ray-tracing at that stage. */
dscene->data.bvh.bvh_layout = BVHParams::best_bvh_layout(scene->params.bvh_layout,
device->get_bvh_layout_mask());
dscene->data.bvh.bvh_layout = BVHParams::best_bvh_layout(
scene->params.bvh_layout, device->get_bvh_layout_mask(dscene->data.kernel_features));
{
scoped_callback_timer timer([scene](double time) {

View File

@ -595,7 +595,7 @@ void ObjectManager::device_update_object_transform(UpdateObjectTransformState *s
void ObjectManager::device_update_prim_offsets(Device *device, DeviceScene *dscene, Scene *scene)
{
if (!scene->integrator->get_use_light_tree()) {
BVHLayoutMask layout_mask = device->get_bvh_layout_mask();
BVHLayoutMask layout_mask = device->get_bvh_layout_mask(dscene->data.kernel_features);
if (layout_mask != BVH_LAYOUT_METAL && layout_mask != BVH_LAYOUT_MULTI_METAL &&
layout_mask != BVH_LAYOUT_MULTI_METAL_EMBREE) {
return;

View File

@ -4,7 +4,6 @@
#ifndef __UTIL_VECTOR_H__
#define __UTIL_VECTOR_H__
#include <cassert>
#include <cstring>
#include <vector>