Nodes: new interactive operator to slide nodes #121981

Open
Jacques Lucke wants to merge 24 commits from JacquesLucke/blender:slide-nodes into main

721 changed files with 81928 additions and 11203 deletions
Showing only changes of commit 384001012e


@@ -1,5 +1,5 @@
name: Bug Report
about: File a bug report
about: Use Help > Report a Bug from the top of Blender to automatically fill out part of this form.
labels:
- "Type/Report"
- "Status/Needs Triage"
@@ -11,7 +11,7 @@ body:
### Instructions
First time reporting? See [tips](https://developer.blender.org/docs/handbook/bug_reports/making_good_bug_reports/).
* Use **Help > Report a Bug** in Blender to fill system information and exact Blender version.
* Use **Help > Report a Bug** from the top of Blender to fill system information and exact Blender version.
* Test [daily builds](https://builder.blender.org/) to verify if the issue is already fixed.
* Test [previous versions](https://download.blender.org/release/) to find an older working version.
* For feature requests, feedback, questions or build issues, see [communication channels](https://developer.blender.org/docs/handbook/communication/user_feedback/).

.github/stale.yml

@@ -1,21 +0,0 @@
# Configuration for probot-stale - https://github.com/probot/stale
# This file is used on Blender's GitHub mirror to automatically close any pull request
# and invite contributors to join the official development platform on blender.org
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 1
# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
daysUntilClose: 1
# Label to use when marking as stale
staleLabel: stale
# Comment to post when closing a stale Issue or Pull Request.
closeComment: >
This issue has been automatically closed, because this repository is only
used as a mirror. Blender development happens on projects.blender.org.
To get started contributing code, please read:
https://developer.blender.org/docs/handbook/contributing/

.github/workflows/stale.yml

@@ -0,0 +1,32 @@
# GitHub Actions workflow for automatically closing pull requests
# This workflow is specific to Blender's GitHub mirror and directs contributors to the official development platform on blender.org
name: Close GitHub Pull Requests
# Trigger this workflow every 12 hours.
on:
schedule:
- cron: '* */12 * * *'
jobs:
close_prs:
name: Close Pull Requests
runs-on: ubuntu-latest
# Only run this job in the read-only mirror repository.
if: github.repository == 'blender/blender' && contains(github.server_url, 'github.com')
# Permissions granted to the GitHub Actions bot.
permissions:
pull-requests: write
steps:
- uses: actions/stale@v9
with:
# Number of days before a pull request is marked as stale.
days-before-pr-stale: 0
# Number of days before a pull request is closed.
days-before-pr-close: 0
# Message posted when closing a pull request.
stale-pr-message: |
This pull request has been automatically closed because this repository is a read-only mirror. Blender development happens on [projects.blender.org](https://projects.blender.org).
To contribute code, please read:
https://developer.blender.org/docs/handbook/contributing/


@@ -18,12 +18,16 @@ macro(fftw_build FFTW_POSTFIX)
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${LIBDIR}/fftw3
-DENABLE_THREADS=ON
-DWITH_COMBINED_THREADS=OFF
-DBUILD_SHARED_LIBS=OFF
-DBUILD_TESTS=OFF
${FFTW_EXTRA_ARGS}
INSTALL_DIR ${LIBDIR}/fftw3
)
else()
set(FFTW_EXTRA_ARGS --enable-static)
set(FFTW_EXTRA_ARGS --enable-static --enable-threads)
set(FFTW_INSTALL install)
ExternalProject_Add(external_fftw3_${FFTW_POSTFIX}
URL file://${PACKAGE_DIR}/${FFTW_FILE}
@@ -57,12 +61,12 @@ if(MSVC)
COMMAND ${CMAKE_COMMAND} -E copy
${LIBDIR}/fftw3/lib/fftw3.lib
${HARVEST_TARGET}/fftw3/lib/fftw3.lib
COMMAND ${CMAKE_COMMAND} -E copy
${LIBDIR}/fftw3/bin/fftw3.dll
${HARVEST_TARGET}/fftw3/lib/fftw3.dll
COMMAND ${CMAKE_COMMAND} -E copy
${LIBDIR}/fftw3/include/fftw3.h
${HARVEST_TARGET}/fftw3/include/fftw3.h
COMMAND ${CMAKE_COMMAND} -E copy
${LIBDIR}/fftw3/lib/fftw3_threads.lib
${HARVEST_TARGET}/fftw3/lib/fftw3_threads.lib
DEPENDEES install
)
ExternalProject_Add_Step(external_fftw3_float after_install
@@ -70,8 +74,8 @@ if(MSVC)
${LIBDIR}/fftw3/lib/fftw3f.lib
${HARVEST_TARGET}/fftw3/lib/fftw3f.lib
COMMAND ${CMAKE_COMMAND} -E copy
${LIBDIR}/fftw3/bin/fftw3f.dll
${HARVEST_TARGET}/fftw3/lib/fftw3f.dll
${LIBDIR}/fftw3/lib/fftw3f_threads.lib
${HARVEST_TARGET}/fftw3/lib/fftw3f_threads.lib
DEPENDEES install
)
endif()
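
Context for the build change above: with `--enable-threads` / `ENABLE_THREADS=ON` and the new `fftw3_threads` / `fftw3f_threads` libraries harvested, code linking against these libs can use FFTW's standard threads API. A minimal sketch of that API (plain FFTW usage, not code from this PR):

#include <fftw3.h>

int main()
{
  /* Must be called once before any plans are created. */
  if (!fftw_init_threads()) {
    return 1;
  }
  /* Plans created after this call may use up to 4 threads. */
  fftw_plan_with_nthreads(4);

  const int n = 1024;
  double *in = fftw_alloc_real(n);
  fftw_complex *out = fftw_alloc_complex(n / 2 + 1);
  for (int i = 0; i < n; i++) {
    in[i] = 0.0;
  }

  fftw_plan plan = fftw_plan_dft_r2c_1d(n, in, out, FFTW_ESTIMATE);
  fftw_execute(plan);

  fftw_destroy_plan(plan);
  fftw_free(in);
  fftw_free(out);
  fftw_cleanup_threads();
  return 0;
}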


@@ -73,3 +73,13 @@ index ae14ced..a49e131 100644
else
return HIPArch::Unknown;
}
--- a/devices/cpu/cpu_engine.h
+++ b/devices/cpu/cpu_engine.h
@@ -7,5 +7,7 @@
#include "cpu_device.h"
#include <queue>
#include <condition_variable>
+// BLENDER: needed for building on Linux.
+#include <thread>
OIDN_NAMESPACE_BEGIN


@@ -440,12 +440,16 @@ endif()
if(WITH_FFTW3)
set(FFTW3 ${LIBDIR}/fftw3)
if(EXISTS ${FFTW3}/lib/libfftw3-3.lib) # 3.6 libraries
set(FFTW3_LIBRARIES ${FFTW3}/lib/libfftw3-3.lib ${FFTW3}/lib/libfftw3f.lib)
elseif(EXISTS ${FFTW3}/lib/libfftw.lib)
set(FFTW3_LIBRARIES ${FFTW3}/lib/libfftw.lib) # 3.5 Libraries
else()
set(FFTW3_LIBRARIES ${FFTW3}/lib/fftw3.lib ${FFTW3}/lib/fftw3f.lib) # msys2+MSVC Libraries
set(FFTW3_LIBRARIES
${FFTW3}/lib/fftw3.lib
${FFTW3}/lib/fftw3f.lib
)
if(EXISTS ${FFTW3}/lib/fftw3_threads.lib)
list(APPEND FFTW3_LIBRARIES
${FFTW3}/lib/fftw3_threads.lib
${FFTW3}/lib/fftw3f_threads.lib
)
set(WITH_FFTW3_THREADS_SUPPORT ON)
endif()
set(FFTW3_INCLUDE_DIRS ${FFTW3}/include)
set(FFTW3_LIBPATH ${FFTW3}/lib)


@@ -183,8 +183,12 @@ def main() -> None:
# Set platform tag following conventions.
if sys.platform == "darwin":
target = cmake_cache_var_or_exit(filepath_cmake_cache, "CMAKE_OSX_DEPLOYMENT_TARGET").split(".")
# Minor version is expected to be always zero starting with macOS 11.
# https://github.com/pypa/packaging/issues/435
target_major = int(target[0])
target_minor = 0 # int(target[1])
machine = cmake_cache_var_or_exit(filepath_cmake_cache, "CMAKE_OSX_ARCHITECTURES")
platform_tag = "macosx_%d_%d_%s" % (int(target[0]), int(target[1]), machine)
platform_tag = "macosx_%d_%d_%s" % (target_major, target_minor, machine)
elif sys.platform == "win32":
platform_tag = "win_%s" % (platform.machine().lower())
elif sys.platform == "linux":
@@ -198,6 +202,10 @@ def main() -> None:
sys.stderr.write("Unsupported platform: %s, abort!\n" % (sys.platform))
sys.exit(1)
# Manually specify, otherwise it uses the version of the executable used to run
# this script which may not match the Blender python version.
python_tag = "py%d%d" % (python_version_number[0], python_version_number[1])
os.chdir(install_dir)
# Include all files recursively.
@@ -223,7 +231,7 @@ def main() -> None:
packages=["bpy"],
package_data={"": package_files("bpy")},
distclass=BinaryDistribution,
options={"bdist_wheel": {"plat_name": platform_tag}},
options={"bdist_wheel": {"plat_name": platform_tag, "python_tag": python_tag}},
description="Blender as a Python module",
long_description=long_description,


@@ -409,7 +409,7 @@ void FFMPEGReader::seek(int position)
{
double pts_time_base = av_q2d(m_formatCtx->streams[m_stream]->time_base);
uint64_t st_time = m_formatCtx->streams[m_stream]->start_time;
int64_t st_time = m_formatCtx->streams[m_stream]->start_time;
uint64_t seek_pos = (uint64_t)(position / (pts_time_base * m_specs.rate));
if(st_time != AV_NOPTS_VALUE)

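The signed/unsigned change above matters because FFmpeg stream start times can be negative and the `AV_NOPTS_VALUE` sentinel is `INT64_MIN`; held in a `uint64_t`, a negative value wraps to a huge positive one, so sign checks and offset arithmetic downstream go wrong. A minimal sketch of the pitfall (illustrative values only, not Audaspace code):

#include <cstdint>
#include <cstdio>

int main()
{
  /* Streams may legitimately start at a negative PTS. */
  const int64_t start_time = -1024;
  /* The old declaration effectively performed this conversion: */
  const uint64_t as_unsigned = (uint64_t)start_time; /* wraps to 2^64 - 1024 */

  /* A sign test on the unsigned copy always passes, so code that adjusts a
   * seek position by `start_time` would apply an absurdly large offset. */
  printf("signed   > 0? %s\n", start_time > 0 ? "yes" : "no");  /* no */
  printf("unsigned > 0? %s\n", as_unsigned > 0 ? "yes" : "no"); /* yes */
  return 0;
}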

@@ -222,7 +222,7 @@ draco::DataType getDataType(size_t componentType)
}
API(uint32_t)
encoderSetAttribute(Encoder *encoder, char *attributeName, size_t componentType, char *dataType, void *data)
encoderSetAttribute(Encoder *encoder, char *attributeName, size_t componentType, char *dataType, void *data, bool normalized)
{
auto buffer = std::make_unique<draco::DataBuffer>();
uint32_t count = encoder->mesh.num_points();
@@ -232,7 +232,7 @@ encoderSetAttribute(Encoder *encoder, char *attributeName, size_t componentType,
draco::GeometryAttribute::Type semantics = getAttributeSemantics(attributeName);
draco::GeometryAttribute attribute;
attribute.Init(semantics, &*buffer, componentCount, getDataType(componentType), false, stride, 0);
attribute.Init(semantics, &*buffer, componentCount, getDataType(componentType), normalized, stride, 0);
auto id = static_cast<uint32_t>(encoder->mesh.AddAttribute(attribute, true, count));
auto dataBytes = reinterpret_cast<uint8_t *>(data);


@@ -53,7 +53,7 @@ API(void)
encoderSetIndices(Encoder *encoder, size_t indexComponentType, uint32_t indexCount, void *indices);
API(uint32_t)
encoderSetAttribute(Encoder *encoder, char *attributeName, size_t componentType, char *dataType, void *data);
encoderSetAttribute(Encoder *encoder, char *attributeName, size_t componentType, char *dataType, void *data, bool normalized);
API(uint32_t)
encoderGetEncodedVertexCount(Encoder *encoder);


@@ -1683,7 +1683,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
elif device_type == 'ONEAPI':
import sys
if sys.platform.startswith("win"):
driver_version = "XX.X.101.5186"
driver_version = "XX.X.101.5518"
col.label(text=rpt_("Requires Intel GPU with Xe-HPG architecture"), icon='BLANK1', translate=False)
col.label(text=rpt_("and Windows driver version %s or newer") % driver_version,
icon='BLANK1', translate=False)


@@ -2437,7 +2437,7 @@ class CYCLES_VIEW3D_PT_shading_lighting(Panel):
sub.template_icon_view(shading, "studio_light", scale_popup=3)
col = split.column()
col.operator("preferences.studiolight_show", emboss=False, text="", icon='PREFERENCES')
col.operator("screen.userpref_show", emboss=False, text="", icon='PREFERENCES').section = 'LIGHTS'
split = layout.split(factor=0.9)
col = split.column()


@@ -178,6 +178,51 @@ void BVHEmbree::build(Progress &progress,
rtcCommitScene(scene);
}
string BVHEmbree::get_last_error_message()
{
const RTCError error_code = rtcGetDeviceError(rtc_device);
switch (error_code) {
case RTC_ERROR_NONE:
return "no error";
case RTC_ERROR_UNKNOWN:
return "unknown error";
case RTC_ERROR_INVALID_ARGUMENT:
return "invalid argument error";
case RTC_ERROR_INVALID_OPERATION:
return "invalid operation error";
case RTC_ERROR_OUT_OF_MEMORY:
return "out of memory error";
case RTC_ERROR_UNSUPPORTED_CPU:
return "unsupported cpu error";
case RTC_ERROR_CANCELLED:
return "cancelled";
default:
/* We should never end up here unless the RTC error enum changes. */
return "unknown error";
}
}
# if defined(WITH_EMBREE_GPU) && RTC_VERSION >= 40302
bool BVHEmbree::offload_scenes_to_gpu(const vector<RTCScene> &scenes)
{
/* Having BVH on GPU is more performance-critical than texture data.
* In order to ensure good performance even when running out of GPU
* memory, we force BVH to migrate to GPU before allocating other textures
* that may not fit. */
for (const RTCScene &embree_scene : scenes) {
RTCSceneFlags scene_flags = rtcGetSceneFlags(embree_scene);
scene_flags = scene_flags | RTC_SCENE_FLAG_PREFETCH_USM_SHARED_ON_GPU;
rtcSetSceneFlags(embree_scene, scene_flags);
rtcCommitScene(embree_scene);
/* In case of any errors from Embree, we should stop
* the execution and propagate the error. */
if (rtcGetDeviceError(rtc_device) != RTC_ERROR_NONE)
return false;
}
return true;
}
# endif
void BVHEmbree::add_object(Object *ob, int i)
{
Geometry *geom = ob->get_geometry();


@@ -18,6 +18,7 @@
# include "bvh/bvh.h"
# include "bvh/params.h"
# include "util/string.h"
# include "util/thread.h"
# include "util/types.h"
# include "util/vector.h"
@@ -36,6 +37,12 @@ class BVHEmbree : public BVH {
const bool isSyclEmbreeDevice = false);
void refit(Progress &progress);
# if defined(WITH_EMBREE_GPU) && RTC_VERSION >= 40302
bool offload_scenes_to_gpu(const vector<RTCScene> &scenes);
# endif
string get_last_error_message();
RTCScene scene;
protected:


@@ -257,6 +257,7 @@ class device_memory {
friend class OptiXDevice;
friend class HIPDevice;
friend class MetalDevice;
friend class OneapiDevice;
/* Only create through subclasses. */
device_memory(Device *device, const char *name, MemoryType type);


@@ -50,11 +50,9 @@ bool device_oneapi_init()
_putenv_s("ONEAPI_DEVICE_SELECTOR", "!opencl:*");
}
}
if (getenv("SYCL_ENABLE_PCI") == nullptr) {
_putenv_s("SYCL_ENABLE_PCI", "1");
}
if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE") == nullptr) {
_putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0");
/* SYSMAN is needed for free_memory queries. */
if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
_putenv_s("ZES_ENABLE_SYSMAN", "1");
}
if (getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE") == nullptr) {
_putenv_s("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE", "0");
@@ -68,8 +66,9 @@ bool device_oneapi_init()
else {
setenv("ONEAPI_DEVICE_SELECTOR", "!opencl:*", false);
}
setenv("SYCL_ENABLE_PCI", "1", false);
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE", "0", false);
/* SYSMAN is needed for free_memory queries. However, it leads to runtime driver issues on Linux
* when using it with JEMALLOC, so we set it to 0 by default until it's fixed. */
setenv("ZES_ENABLE_SYSMAN", "0", false);
setenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE", "0", false);
# endif


@@ -11,6 +11,7 @@
# include "device/oneapi/device_impl.h"
# include "util/debug.h"
# include "util/foreach.h"
# include "util/log.h"
# ifdef WITH_EMBREE_GPU
@@ -47,18 +48,20 @@ static void queue_error_cb(const char *message, void *user_ptr)
}
OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
: Device(info, stats, profiler),
: GPUDevice(info, stats, profiler),
device_queue_(nullptr),
# ifdef WITH_EMBREE_GPU
embree_device(nullptr),
embree_scene(nullptr),
# endif
texture_info_(this, "texture_info", MEM_GLOBAL),
kg_memory_(nullptr),
kg_memory_device_(nullptr),
kg_memory_size_(0)
{
need_texture_info_ = false;
/* Verify that base class types can be used with specific backend types */
static_assert(sizeof(texMemObject) == sizeof(void *));
static_assert(sizeof(arrayMemObject) == sizeof(void *));
use_hardware_raytracing = info.use_hardware_raytracing;
oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
@@ -110,6 +113,18 @@ OneapiDevice::OneapiDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
kg_memory_size_ = globals_segment_size;
max_memory_on_device_ = get_memcapacity();
init_host_memory();
move_texture_to_host = false;
can_map_host = true;
const char *headroom_str = getenv("CYCLES_ONEAPI_MEMORY_HEADROOM");
if (headroom_str != nullptr) {
const long long override_headroom = (float)atoll(headroom_str);
device_working_headroom = override_headroom;
device_texture_headroom = override_headroom;
}
VLOG_DEBUG << "oneAPI memory headroom size: "
<< string_human_readable_size(device_working_headroom);
}
OneapiDevice::~OneapiDevice()
@@ -119,7 +134,7 @@ OneapiDevice::~OneapiDevice()
rtcReleaseDevice(embree_device);
# endif
texture_info_.free();
texture_info.free();
usm_free(device_queue_, kg_memory_);
usm_free(device_queue_, kg_memory_device_);
@@ -166,8 +181,22 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
else {
bvh_embree->build(progress, &stats, embree_device, true);
}
# if RTC_VERSION >= 40302
thread_scoped_lock lock(scene_data_mutex);
all_embree_scenes.push_back(bvh_embree->scene);
# endif
if (bvh->params.top_level) {
embree_scene = bvh_embree->scene;
# if RTC_VERSION >= 40302
if (bvh_embree->offload_scenes_to_gpu(all_embree_scenes) == false) {
set_error(
string_printf("BVH failed to to migrate to the GPU due to Embree library error (%s)",
bvh_embree->get_last_error_message()));
}
all_embree_scenes.clear();
# endif
}
}
else {
@@ -176,6 +205,26 @@ void OneapiDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
}
# endif
size_t OneapiDevice::get_free_mem() const
{
/* Accurate: Use device info, which is practically useful only on dGPU.
* This is because for non-discrete GPUs, all GPU memory allocations would
* be in the RAM, thus having the same performance for device and host pointers,
* so there is no need to be very accurate about what would end where. */
const sycl::device &device = reinterpret_cast<sycl::queue *>(device_queue_)->get_device();
const bool is_integrated_gpu = device.get_info<sycl::info::device::host_unified_memory>();
if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu == false) {
return device.get_info<sycl::ext::intel::info::device::free_memory>();
}
/* Estimate: Capacity - in use. */
else if (device_mem_in_use < max_memory_on_device_) {
return max_memory_on_device_ - device_mem_in_use;
}
else {
return 0;
}
}
bool OneapiDevice::load_kernels(const uint requested_features)
{
assert(device_queue_);
@@ -208,63 +257,101 @@ bool OneapiDevice::load_kernels(const uint requested_features)
VLOG_INFO << "Kernels loading (compilation) has been done for \"" << info.description << "\"";
}
if (is_finished_ok) {
reserve_private_memory(requested_features);
is_finished_ok = !have_error();
}
return is_finished_ok;
}
void OneapiDevice::load_texture_info()
void OneapiDevice::reserve_private_memory(const uint kernel_features)
{
if (need_texture_info_) {
need_texture_info_ = false;
texture_info_.copy_to_device();
size_t free_before = get_free_mem();
/* Use the biggest kernel for estimation. */
const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
(kernel_features & KERNEL_FEATURE_MNEE) ?
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
{
unique_ptr<DeviceQueue> queue = gpu_queue_create();
device_ptr d_path_index = 0;
device_ptr d_render_buffer = 0;
int d_work_size = 0;
DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
queue->init_execution();
/* Launch of the kernel seems to be sufficient to reserve all
* needed memory regardless of the execution global size.
* So, the smallest possible size is used here. */
queue->enqueue(test_kernel, 1, args);
queue->synchronize();
}
size_t free_after = get_free_mem();
VLOG_INFO << "For kernel execution were reserved "
<< string_human_readable_number(free_before - free_after) << " bytes. ("
<< string_human_readable_size(free_before - free_after) << ")";
}
void OneapiDevice::generic_alloc(device_memory &mem)
void OneapiDevice::get_device_memory_info(size_t &total, size_t &free)
{
size_t memory_size = mem.memory_size();
/* TODO(@nsirgien): In future, if scene doesn't fit into device memory, then
* we can use USM host memory.
* Because of the expected performance impact, implementation of this has had a low priority
* and is not implemented yet. */
assert(device_queue_);
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
* and shared. For new project it maybe more beneficial to use USM shared memory, because it
* provides automatic migration mechanism in order to allow to use the same pointer on host and
* on device, without need to worry about explicit memory transfer operations. But for
* Blender/Cycles this type of memory is not very suitable in current application architecture,
* because Cycles already uses two different pointer for host activity and device activity, and
* also has to perform all needed memory transfer operations. So, USM device memory
* type has been used for oneAPI device in order to better fit in Cycles architecture. */
void *device_pointer = nullptr;
if (mem.memory_size() + stats.mem_used < max_memory_on_device_)
device_pointer = usm_alloc_device(device_queue_, memory_size);
if (device_pointer == nullptr) {
set_error("oneAPI kernel - device memory allocation error for " +
string_human_readable_size(mem.memory_size()) +
", possibly caused by lack of available memory space on the device: " +
string_human_readable_size(stats.mem_used) + " of " +
string_human_readable_size(max_memory_on_device_) + " is already allocated");
}
mem.device_pointer = reinterpret_cast<ccl::device_ptr>(device_pointer);
mem.device_size = memory_size;
stats.mem_alloc(memory_size);
free = get_free_mem();
total = max_memory_on_device_;
}
void OneapiDevice::generic_copy_to(device_memory &mem)
bool OneapiDevice::alloc_device(void *&device_pointer, size_t size)
{
if (!mem.device_pointer) {
return;
}
size_t memory_size = mem.memory_size();
bool allocation_success = false;
device_pointer = usm_alloc_device(device_queue_, size);
if (device_pointer != nullptr) {
allocation_success = true;
/* Due to lazy memory initialization in GPU runtime we will force memory to
* appear in device memory via execution of a kernel using this memory. */
if (!oneapi_zero_memory_on_device(device_queue_, device_pointer, size)) {
set_error("oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
"\"");
usm_free(device_queue_, device_pointer);
/* Copy operation from host shouldn't be requested if there is no memory allocated on host. */
assert(mem.host_pointer);
assert(device_queue_);
usm_memcpy(device_queue_, (void *)mem.device_pointer, (void *)mem.host_pointer, memory_size);
device_pointer = nullptr;
allocation_success = false;
}
}
return allocation_success;
}
void OneapiDevice::free_device(void *device_pointer)
{
usm_free(device_queue_, device_pointer);
}
bool OneapiDevice::alloc_host(void *&shared_pointer, size_t size)
{
shared_pointer = usm_aligned_alloc_host(device_queue_, size, 64);
return shared_pointer != nullptr;
}
void OneapiDevice::free_host(void *shared_pointer)
{
usm_free(device_queue_, shared_pointer);
}
void OneapiDevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
/* Device and host pointer are in the same address space
* as we're using Unified Shared Memory. */
device_pointer = shared_pointer;
}
void OneapiDevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
usm_memcpy(device_queue_, device_pointer, host_pointer, size);
}
/* TODO: Make sycl::queue part of OneapiQueue and avoid using pointers to sycl::queue. */
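
The hook methods above (`alloc_device`, `free_device`, `alloc_host`, `free_host`, `transform_host_pointer`, `copy_host_to_device`) exist because `OneapiDevice` now derives from the shared `GPUDevice` base class, which owns the generic allocation bookkeeping that the old per-backend `generic_alloc`/`generic_free` duplicated (hence their removal below). Roughly, the base class drives the hooks like this; a simplified sketch of the pattern, not the actual Cycles `GPUDevice` code:

#include <cstddef>
#include <cstdlib>

/* Simplified stand-in for Cycles' GPUDevice: subclasses provide the raw
 * backend hooks, the base class owns the policy (try device memory first,
 * fall back to device-mappable host memory when that fails). */
class GPUDeviceSketch {
 public:
  virtual ~GPUDeviceSketch() = default;

  void *generic_alloc(size_t size)
  {
    void *device_pointer = nullptr;
    if (alloc_device(device_pointer, size)) {
      return device_pointer; /* Fits in device memory. */
    }
    /* Out of device memory: use host memory the device can map. */
    void *shared_pointer = nullptr;
    if (alloc_host(shared_pointer, size)) {
      transform_host_pointer(device_pointer, shared_pointer);
      return device_pointer;
    }
    return nullptr;
  }

 protected:
  virtual bool alloc_device(void *&device_pointer, size_t size) = 0;
  virtual bool alloc_host(void *&shared_pointer, size_t size) = 0;
  virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) = 0;
};

/* Toy backend where both pools are plain malloc, just to exercise the flow. */
class ToyDevice : public GPUDeviceSketch {
 protected:
  bool alloc_device(void *&p, size_t size) override
  {
    p = malloc(size);
    return p != nullptr;
  }
  bool alloc_host(void *&p, size_t size) override
  {
    p = malloc(size);
    return p != nullptr;
  }
  void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override
  {
    device_pointer = shared_pointer; /* Same address space, as in the oneAPI backend. */
  }
};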
@@ -288,20 +375,6 @@ void *OneapiDevice::kernel_globals_device_pointer()
return kg_memory_device_;
}
void OneapiDevice::generic_free(device_memory &mem)
{
if (!mem.device_pointer) {
return;
}
stats.mem_free(mem.device_size);
mem.device_size = 0;
assert(device_queue_);
usm_free(device_queue_, (void *)mem.device_pointer);
mem.device_pointer = 0;
}
void OneapiDevice::mem_alloc(device_memory &mem)
{
if (mem.type == MEM_TEXTURE) {
@@ -344,7 +417,7 @@ void OneapiDevice::mem_copy_to(device_memory &mem)
}
else {
if (!mem.device_pointer)
mem_alloc(mem);
generic_alloc(mem);
generic_copy_to(mem);
}
@@ -515,14 +588,14 @@ void OneapiDevice::tex_alloc(device_texture &mem)
/* Resize if needed. Also, in case of resize - allocate in advance for future allocations. */
const uint slot = mem.slot;
if (slot >= texture_info_.size()) {
texture_info_.resize(slot + 128);
if (slot >= texture_info.size()) {
texture_info.resize(slot + 128);
}
texture_info_[slot] = mem.info;
need_texture_info_ = true;
texture_info[slot] = mem.info;
need_texture_info = true;
texture_info_[slot].data = (uint64_t)mem.device_pointer;
texture_info[slot].data = (uint64_t)mem.device_pointer;
}
void OneapiDevice::tex_free(device_texture &mem)
@@ -628,6 +701,16 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
/* NOTE(@nsirgien): There are three types of Unified Shared Memory (USM) in oneAPI: host, device
* and shared. For a new project it could be more beneficial to use USM shared memory, because it
* provides an automatic migration mechanism that allows using the same pointer on the host and on
* the device without worrying about explicit memory transfer operations, although USM shared does
* imply some documented limitations on memory usage with regard to parallel access from different
* threads. But for Blender/Cycles this type of memory is not very suitable in the current
* application architecture, because Cycles is a multi-threaded application that already uses two
* different pointers for host and device activity, and also has to perform all the needed memory
* transfer operations anyway. So, the USM device memory type has been used for the oneAPI device
* in order to better fit the Cycles architecture. */
# ifndef WITH_ONEAPI_SYCL_HOST_TASK
return sycl::malloc_device(memory_size, *queue);
# else
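
For reference, the three USM flavors the NOTE above contrasts map to the standard SYCL 2020 allocation entry points; a minimal sketch in plain SYCL (not Cycles code):

#include <sycl/sycl.hpp>

int main()
{
  sycl::queue q;
  const size_t n = 1024;

  /* Device USM: fastest on discrete GPUs, host code cannot dereference it,
   * transfers must be explicit (this is what Cycles uses). */
  float *dev = sycl::malloc_device<float>(n, q);
  /* Host USM: lives in host memory but is accessible from the device. */
  float *host = sycl::malloc_host<float>(n, q);
  /* Shared USM: one pointer, pages migrate automatically between host and
   * device, with documented restrictions on concurrent access. */
  float *shared = sycl::malloc_shared<float>(n, q);

  for (size_t i = 0; i < n; i++) {
    host[i] = float(i);
  }
  q.memcpy(dev, host, n * sizeof(float)).wait();

  sycl::free(dev, q);
  sycl::free(host, q);
  sycl::free(shared, q);
  return 0;
}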
@@ -646,9 +729,26 @@ void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memcpy may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, dest, true);
OneapiDevice::check_usm(queue_, src, true);
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
/* Unknown here means that this is not a USM allocation, which implies that this is
* some generic C++ allocation, so we can use plain C++ memcpy directly, as with USM host. */
if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
(src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
{
memcpy(dest, src, num_bytes);
return true;
}
try {
sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
# ifdef WITH_CYCLES_DEBUG
@@ -658,8 +758,6 @@ bool OneapiDevice::usm_memcpy(SyclQueue *queue_, void *dest, void *src, size_t num_bytes)
mem_event.wait_and_throw();
return true;
# else
sycl::usm::alloc dest_type = get_pointer_type(dest, queue->get_context());
sycl::usm::alloc src_type = get_pointer_type(src, queue->get_context());
bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
src_type == sycl::usm::alloc::device;
bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
@@ -684,6 +782,12 @@ bool OneapiDevice::usm_memset(SyclQueue *queue_,
size_t num_bytes)
{
assert(queue_);
/* sycl::queue::memset may crash if the queue is in an invalid state due to previous
* runtime errors. It's better to avoid running memory operations in that case.
* The render will be canceled and the queue will be destroyed anyway. */
if (have_error())
return false;
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
OneapiDevice::check_usm(queue_, usm_ptr, true);
try {
@@ -735,7 +839,7 @@ void OneapiDevice::set_global_memory(SyclQueue *queue_,
assert(memory_name);
assert(memory_device_pointer);
KernelGlobalsGPU *globals = (KernelGlobalsGPU *)kernel_globals;
OneapiDevice::check_usm(queue_, memory_device_pointer);
OneapiDevice::check_usm(queue_, memory_device_pointer, true);
OneapiDevice::check_usm(queue_, kernel_globals, true);
std::string matched_name(memory_name);
@@ -874,11 +978,11 @@ void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
/* Compute-runtime (ie. NEO) version is what gets returned by sycl/L0 on Windows
* since Windows driver 101.3268. */
static const int lowest_supported_driver_version_win = 1015186;
static const int lowest_supported_driver_version_win = 1015518;
# ifdef _WIN32
/* For Windows driver 101.5186, compute-runtime version is 28044.
/* For Windows driver 101.5518, compute-runtime version is 28044.
* This information is returned by `ocloc query OCL_DRIVER_VERSION`.*/
static const int lowest_supported_driver_version_neo = 28044;
static const int lowest_supported_driver_version_neo = 29283;
# else
static const int lowest_supported_driver_version_neo = 27642;
# endif


@@ -21,17 +21,19 @@ typedef void (*OneAPIDeviceIteratorCallback)(const char *id,
bool oidn_support,
void *user_ptr);
class OneapiDevice : public Device {
class OneapiDevice : public GPUDevice {
private:
SyclQueue *device_queue_;
# ifdef WITH_EMBREE_GPU
RTCDevice embree_device;
RTCScene embree_scene;
# if RTC_VERSION >= 40302
thread_mutex scene_data_mutex;
vector<RTCScene> all_embree_scenes;
# endif
# endif
using ConstMemMap = map<string, device_vector<uchar> *>;
ConstMemMap const_mem_map_;
device_vector<TextureInfo> texture_info_;
bool need_texture_info_;
void *kg_memory_;
void *kg_memory_device_;
size_t kg_memory_size_ = (size_t)0;
@@ -41,6 +43,8 @@ class OneapiDevice : public Device {
unsigned int kernel_features = 0;
int scene_max_shaders_ = 0;
size_t get_free_mem() const;
public:
virtual BVHLayoutMask get_bvh_layout_mask(uint kernel_features) const override;
@@ -54,13 +58,15 @@ class OneapiDevice : public Device {
bool load_kernels(const uint kernel_features) override;
void load_texture_info();
void reserve_private_memory(const uint kernel_features);
void generic_alloc(device_memory &mem);
void generic_copy_to(device_memory &mem);
void generic_free(device_memory &mem);
virtual void get_device_memory_info(size_t &total, size_t &free) override;
virtual bool alloc_device(void *&device_pointer, size_t size) override;
virtual void free_device(void *device_pointer) override;
virtual bool alloc_host(void *&shared_pointer, size_t size) override;
virtual void free_host(void *shared_pointer) override;
virtual void transform_host_pointer(void *&device_pointer, void *&shared_pointer) override;
virtual void copy_host_to_device(void *device_pointer, void *host_pointer, size_t size) override;
string oneapi_error_message();


@@ -142,11 +142,13 @@ ccl_device_inline float longitudinal_scattering(
if (v <= 0.1f) {
float i0 = log_bessel_I0(cos_arg);
float val = expf(i0 - sin_arg - inv_v + 0.6931f + logf(0.5f * inv_v));
kernel_assert(isfinite_safe(val));
return val;
}
else {
float i0 = bessel_I0(cos_arg);
float val = (expf(-sin_arg) * i0) / (sinhf(inv_v) * 2.0f * v);
kernel_assert(isfinite_safe(val));
return val;
}
}
@@ -185,7 +187,7 @@ ccl_device int bsdf_hair_chiang_setup(ccl_private ShaderData *sd, ccl_private Ch
kernel_assert(isfinite_safe(bsdf->h));
bsdf->N = Y;
bsdf->alpha = -bsdf->alpha;
return SD_BSDF | SD_BSDF_HAS_EVAL | SD_BSDF_HAS_TRANSMISSION;
}
@@ -224,9 +226,9 @@
Ap_energy[3] *= fac;
}
/* Given the tilt angle, generate the rotated theta_i for the different bounces. */
ccl_device_inline void hair_alpha_angles(float sin_theta_i,
float cos_theta_i,
/* Update sin_theta_o and cos_theta_o to account for scale tilt for each bounce. */
ccl_device_inline void hair_alpha_angles(float sin_theta_o,
float cos_theta_o,
float alpha,
ccl_private float *angles)
{
@@ -237,12 +239,12 @@
float sin_4alpha = 2.0f * sin_2alpha * cos_2alpha;
float cos_4alpha = sqr(cos_2alpha) - sqr(sin_2alpha);
angles[0] = sin_theta_i * cos_2alpha + cos_theta_i * sin_2alpha;
angles[1] = fabsf(cos_theta_i * cos_2alpha - sin_theta_i * sin_2alpha);
angles[2] = sin_theta_i * cos_1alpha - cos_theta_i * sin_1alpha;
angles[3] = fabsf(cos_theta_i * cos_1alpha + sin_theta_i * sin_1alpha);
angles[4] = sin_theta_i * cos_4alpha - cos_theta_i * sin_4alpha;
angles[5] = fabsf(cos_theta_i * cos_4alpha + sin_theta_i * sin_4alpha);
angles[0] = sin_theta_o * cos_2alpha - cos_theta_o * sin_2alpha;
angles[1] = fabsf(cos_theta_o * cos_2alpha + sin_theta_o * sin_2alpha);
angles[2] = sin_theta_o * cos_1alpha + cos_theta_o * sin_1alpha;
angles[3] = fabsf(cos_theta_o * cos_1alpha - sin_theta_o * sin_1alpha);
angles[4] = sin_theta_o * cos_4alpha + cos_theta_o * sin_4alpha;
angles[5] = fabsf(cos_theta_o * cos_4alpha - sin_theta_o * sin_4alpha);
}
/* Evaluation function for our shader. */
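
For reference, the rewritten `hair_alpha_angles` above is just the angle-addition identities applied to the outgoing angle with cuticle tilt α, one tilted (sin, |cos|) pair per lobe, matching the PBRT convention this model follows:

\begin{aligned}
(\text{angles}[0],\ \text{angles}[1]) &= \bigl(\sin(\theta_o - 2\alpha),\ \lvert\cos(\theta_o - 2\alpha)\rvert\bigr) && \text{R lobe} \\
(\text{angles}[2],\ \text{angles}[3]) &= \bigl(\sin(\theta_o + \alpha),\ \lvert\cos(\theta_o + \alpha)\rvert\bigr) && \text{TT lobe} \\
(\text{angles}[4],\ \text{angles}[5]) &= \bigl(\sin(\theta_o + 4\alpha),\ \lvert\cos(\theta_o + 4\alpha)\rvert\bigr) && \text{TRT lobe}
\end{aligned}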
@@ -293,17 +295,17 @@ ccl_device Spectrum bsdf_hair_chiang_eval(KernelGlobals kg,
const float phi = phi_i - phi_o;
float angles[6];
hair_alpha_angles(sin_theta_i, cos_theta_i, bsdf->alpha, angles);
hair_alpha_angles(sin_theta_o, cos_theta_o, bsdf->alpha, angles);
Spectrum F = zero_spectrum();
float F_energy = 0.0f;
/* Primary specular (R), Transmission (TT) and Secondary Specular (TRT). */
for (int i = 0; i < 3; i++) {
const float Mp = longitudinal_scattering(angles[2 * i],
const float Mp = longitudinal_scattering(sin_theta_i,
cos_theta_i,
angles[2 * i],
angles[2 * i + 1],
sin_theta_o,
cos_theta_o,
(i == 0) ? bsdf->m0_roughness :
(i == 1) ? 0.25f * bsdf->v :
4.0f * bsdf->v);
@@ -347,6 +349,7 @@ ccl_device int bsdf_hair_chiang_sample(KernelGlobals kg,
kernel_assert(fabsf(dot(X, Y)) < 1e-3f);
const float3 Z = safe_normalize(cross(X, Y));
/* `wo` in PBRT. */
const float3 local_O = make_float3(dot(sd->wi, X), dot(sd->wi, Y), dot(sd->wi, Z));
const float sin_theta_o = local_O.x;
@@ -387,19 +390,20 @@
v *= 4.0f;
}
float angles[6];
hair_alpha_angles(sin_theta_o, cos_theta_o, bsdf->alpha, angles);
float sin_theta_o_tilted = sin_theta_o;
float cos_theta_o_tilted = cos_theta_o;
if (p < 3) {
sin_theta_o_tilted = angles[2 * p];
cos_theta_o_tilted = angles[2 * p + 1];
}
rand.z = max(rand.z, 1e-5f);
const float fac = 1.0f + v * logf(rand.z + (1.0f - rand.z) * expf(-2.0f / v));
float sin_theta_i = -fac * sin_theta_o +
cos_from_sin(fac) * cosf(M_2PI_F * rand.y) * cos_theta_o;
float sin_theta_i = -fac * sin_theta_o_tilted +
sin_from_cos(fac) * cosf(M_2PI_F * rand.y) * cos_theta_o_tilted;
float cos_theta_i = cos_from_sin(sin_theta_i);
float angles[6];
if (p < 3) {
hair_alpha_angles(sin_theta_i, cos_theta_i, -bsdf->alpha, angles);
sin_theta_i = angles[2 * p];
cos_theta_i = angles[2 * p + 1];
}
float phi;
if (p < 3) {
phi = delta_phi(p, gamma_o, gamma_t) + sample_trimmed_logistic(rand.x, bsdf->s);
@@ -409,17 +413,15 @@
}
const float phi_i = phi_o + phi;
hair_alpha_angles(sin_theta_i, cos_theta_i, bsdf->alpha, angles);
Spectrum F = zero_spectrum();
float F_energy = 0.0f;
/* Primary specular (R), Transmission (TT) and Secondary Specular (TRT). */
for (int i = 0; i < 3; i++) {
const float Mp = longitudinal_scattering(angles[2 * i],
const float Mp = longitudinal_scattering(sin_theta_i,
cos_theta_i,
angles[2 * i],
angles[2 * i + 1],
sin_theta_o,
cos_theta_o,
(i == 0) ? bsdf->m0_roughness :
(i == 1) ? 0.25f * bsdf->v :
4.0f * bsdf->v);


@@ -133,6 +133,26 @@ bool oneapi_run_test_kernel(SyclQueue *queue_)
return is_computation_correct;
}
bool oneapi_zero_memory_on_device(SyclQueue *queue_, void *device_pointer, size_t num_bytes)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
try {
queue->submit([&](sycl::handler &cgh) {
cgh.parallel_for(num_bytes,
[=](sycl::id<1> idx) { ((char *)device_pointer)[idx.get(0)] = (char)0; });
});
queue->wait_and_throw();
return true;
}
catch (sycl::exception const &e) {
if (s_error_cb) {
s_error_cb(e.what(), s_error_user_ptr);
}
return false;
}
}
bool oneapi_kernel_is_required_for_features(const std::string &kernel_name,
const uint kernel_features)
{


@@ -44,6 +44,9 @@ extern "C" {
# endif
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_run_test_kernel(SyclQueue *queue_);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_zero_memory_on_device(SyclQueue *queue_,
void *device_pointer,
size_t num_bytes);
CYCLES_KERNEL_ONEAPI_EXPORT void oneapi_set_error_cb(OneAPIErrorCallback cb, void *user_ptr);
CYCLES_KERNEL_ONEAPI_EXPORT size_t oneapi_suggested_gpu_kernel_size(const DeviceKernel kernel);
CYCLES_KERNEL_ONEAPI_EXPORT bool oneapi_enqueue_kernel(KernelContext *context,


@@ -39,6 +39,11 @@ ccl_device int shadow_linking_pick_mesh_intersection(KernelGlobals kg,
const uint visibility = path_state_ray_visibility(state);
int transparent_bounce = INTEGRATOR_STATE(state, path, transparent_bounce);
int volume_bounce = INTEGRATOR_STATE(state, path, volume_bounce);
/* TODO: Replace the loop with sequential calls to the kernel, similar to the transparent shadow
* intersection kernel. */
for (int i = 0; i < SHADOW_LINK_MAX_INTERSECTION_COUNT; i++) {
Intersection current_isect ccl_optional_struct_init;
current_isect.object = OBJECT_NONE;
@@ -68,12 +73,33 @@
}
}
const uint blocker_set = kernel_data_fetch(objects, current_isect.object).blocker_shadow_set;
if (blocker_set == 0) {
/* Contribution from the lights past the default blocker is accumulated using the main path.
*/
ray->tmax = current_isect.t;
break;
/* Contribution from the lights past the default opaque blocker is accumulated
* using the main path. */
if (!(shader_flags & (SD_HAS_ONLY_VOLUME | SD_HAS_TRANSPARENT_SHADOW))) {
const uint blocker_set = kernel_data_fetch(objects, current_isect.object).blocker_shadow_set;
if (blocker_set == 0) {
ray->tmax = current_isect.t;
break;
}
}
else {
/* Lights past the maximum allowed transparency bounce do not contribute any light, so
* consider them as fully blocked and only consider lights prior to this intersection. */
if (shader_flags & SD_HAS_TRANSPARENT_SHADOW) {
++transparent_bounce;
if (transparent_bounce >= kernel_data.integrator.transparent_max_bounce) {
ray->tmax = current_isect.t;
break;
}
}
else {
kernel_assert(shader_flags & SD_HAS_ONLY_VOLUME);
++volume_bounce;
if (volume_bounce >= kernel_data.integrator.max_volume_bounce) {
ray->tmax = current_isect.t;
break;
}
}
}
/* Move the ray forward. */


@@ -177,27 +177,27 @@ ccl_device_forceinline bool triangle_light_sample(KernelGlobals kg,
const float cos_a = dot(B, C);
const float cos_b = dot(A, C);
const float cos_c = dot(A, B);
const float sin_b_sin_c_2 = (1.0f - sqr(cos_b)) * (1.0f - sqr(cos_c));
const float mixed_product = fabsf(dot(A, cross(B, C)));
/* The area of the spherical triangle is equal to the subtended solid angle. */
const float solid_angle = 2.0f * fast_atan2f(mixed_product, (1.0f + cos_a + cos_b + cos_c));
/* Compute the angle at A. */
const float cos_alpha = dot(safe_normalize(cross(A, B)), safe_normalize(cross(A, C)))
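
For reference, the `solid_angle` computed above (with cos a = B·C, cos b = A·C, cos c = A·B for unit vertices A, B, C on the sphere) is the Van Oosterom–Strackee formula for the solid angle subtended by a spherical triangle:

\Omega = 2\,\operatorname{atan2}\!\bigl(\lvert \mathbf{A}\cdot(\mathbf{B}\times\mathbf{C})\rvert,\ 1 + \mathbf{A}\cdot\mathbf{B} + \mathbf{B}\cdot\mathbf{C} + \mathbf{A}\cdot\mathbf{C}\bigr)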