Cycles: Optimizations #116951

Closed
Odilkhon Yakubov wants to merge 2 commits from (deleted):main into main

When changing the target branch, be careful to rebase the branch in your fork to match. See documentation.
2 changed files with 190 additions and 289 deletions

View File

@ -7,7 +7,7 @@
CCL_NAMESPACE_BEGIN
/* Device Memory */
constexpr size_t MIN_ALIGNMENT_CPU_DATA_TYPES = 16;
device_memory::device_memory(Device *device, const char *name, MemoryType type)
: data_type(device_type_traits<uchar>::data_type),
@ -20,217 +20,190 @@ device_memory::device_memory(Device *device, const char *name, MemoryType type)
type(type),
name(name),
device(device),
device_pointer(0),
host_pointer(0),
shared_pointer(0),
device_pointer(nullptr),
host_pointer(nullptr),
shared_pointer(nullptr),
shared_counter(0),
original_device_ptr(0),
original_device_ptr(nullptr),
original_device_size(0),
original_device(0),
original_device(nullptr),
need_realloc_(false),
modified(false)
{
modified(false) {}
device_memory::~device_memory() {
assert(shared_pointer == nullptr);
assert(shared_counter == 0);
}
device_memory::~device_memory()
{
assert(shared_pointer == 0);
assert(shared_counter == 0);
}
void *device_memory::host_alloc(size_t size) {
if (size == 0) {
return nullptr;
}
void *device_memory::host_alloc(size_t size)
{
if (!size) {
return 0;
}
void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
void *ptr = util_aligned_malloc(size, MIN_ALIGNMENT_CPU_DATA_TYPES);
if (!ptr) {
throw std::bad_alloc("Failed to allocate memory.");
}
if (ptr) {
util_guarded_mem_alloc(size);
}
else {
throw std::bad_alloc();
}
return ptr;
return ptr;
}
void device_memory::host_free()
{
if (host_pointer) {
util_guarded_mem_free(memory_size());
util_aligned_free((void *)host_pointer);
host_pointer = 0;
}
void device_memory::host_free() {
if (host_pointer) {
util_guarded_mem_free(memory_size());
util_aligned_free(host_pointer);
host_pointer = nullptr;
}
}
void device_memory::device_alloc()
{
assert(!device_pointer && type != MEM_TEXTURE && type != MEM_GLOBAL);
device->mem_alloc(*this);
void device_memory::device_alloc() {
assert(device_pointer == nullptr && type != MEM_TEXTURE && type != MEM_GLOBAL);
device->mem_alloc(*this);
}
void device_memory::device_free()
{
if (device_pointer) {
device->mem_free(*this);
}
void device_memory::device_free() {
if (device_pointer) {
device->mem_free(*this);
}
}
void device_memory::device_copy_to()
{
if (host_pointer) {
device->mem_copy_to(*this);
}
void device_memory::device_copy_to() {
if (host_pointer) {
device->mem_copy_to(*this);
}
}
void device_memory::device_copy_from(size_t y, size_t w, size_t h, size_t elem)
{
assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
device->mem_copy_from(*this, y, w, h, elem);
void device_memory::device_copy_from(size_t y, size_t w, size_t h, size_t elem) {
assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
device->mem_copy_from(*this, y, w, h, elem);
}
void device_memory::device_zero()
{
if (data_size) {
device->mem_zero(*this);
}
void device_memory::device_zero() {
if (data_size) {
device->mem_zero(*this);
}
}
bool device_memory::device_is_cpu()
{
return (device->info.type == DEVICE_CPU);
bool device_memory::device_is_cpu() {
return (device->info.type == DEVICE_CPU);
}
void device_memory::swap_device(Device *new_device,
size_t new_device_size,
device_ptr new_device_ptr)
{
original_device = device;
original_device_size = device_size;
original_device_ptr = device_pointer;
device_ptr new_device_ptr) {
original_device = device;
original_device_size = device_size;
original_device_ptr = device_pointer;
device = new_device;
device_size = new_device_size;
device_pointer = new_device_ptr;
device = new_device;
device_size = new_device_size;
device_pointer = new_device_ptr;
}
void device_memory::restore_device()
{
device = original_device;
device_size = original_device_size;
device_pointer = original_device_ptr;
void device_memory::restore_device() {
device = original_device;
device_size = original_device_size;
device_pointer = original_device_ptr;
}
bool device_memory::is_resident(Device *sub_device) const
{
return device->is_resident(device_pointer, sub_device);
bool device_memory::is_resident(Device *sub_device) const {
return device->is_resident(device_pointer, sub_device);
}
/* Device Sub `ptr`. */
device_sub_ptr::device_sub_ptr(device_memory &mem, size_t offset, size_t size) : device(mem.device)
{
ptr = device->mem_alloc_sub_ptr(mem, offset, size);
device_sub_ptr::device_sub_ptr(device_memory &mem, size_t offset, size_t size) : device(mem.device) {
ptr = device->mem_alloc_sub_ptr(mem, offset, size);
}
device_sub_ptr::~device_sub_ptr()
{
device->mem_free_sub_ptr(ptr);
device_sub_ptr::~device_sub_ptr() {
device->mem_free_sub_ptr(ptr);
}
/* Device Texture */
device_texture::device_texture(Device *device,
const char *name,
const uint slot,
ImageDataType image_data_type,
InterpolationType interpolation,
ExtensionType extension)
: device_memory(device, name, MEM_TEXTURE), slot(slot)
{
switch (image_data_type) {
case IMAGE_DATA_TYPE_FLOAT4:
data_type = TYPE_FLOAT;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_FLOAT:
data_type = TYPE_FLOAT;
data_elements = 1;
break;
case IMAGE_DATA_TYPE_BYTE4:
data_type = TYPE_UCHAR;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_BYTE:
case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
case IMAGE_DATA_TYPE_NANOVDB_FPN:
case IMAGE_DATA_TYPE_NANOVDB_FP16:
data_type = TYPE_UCHAR;
data_elements = 1;
break;
case IMAGE_DATA_TYPE_HALF4:
data_type = TYPE_HALF;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_HALF:
data_type = TYPE_HALF;
data_elements = 1;
break;
case IMAGE_DATA_TYPE_USHORT4:
data_type = TYPE_UINT16;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_USHORT:
data_type = TYPE_UINT16;
data_elements = 1;
break;
case IMAGE_DATA_NUM_TYPES:
assert(0);
return;
}
: device_memory(device, name, MEM_TEXTURE), slot(slot) {
switch (image_data_type) {
case IMAGE_DATA_TYPE_FLOAT4:
data_type = TYPE_FLOAT;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_FLOAT:
data_type = TYPE_FLOAT;
data_elements = 1;
break;
case IMAGE_DATA_TYPE_BYTE4:
data_type = TYPE_UCHAR;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_BYTE:
case IMAGE_DATA_TYPE_NANOVDB_FLOAT:
case IMAGE_DATA_TYPE_NANOVDB_FLOAT3:
case IMAGE_DATA_TYPE_NANOVDB_FPN:
case IMAGE_DATA_TYPE_NANOVDB_FP16:
data_type = TYPE_UCHAR;
data_elements = 1;
break;
case IMAGE_DATA_TYPE_HALF4:
data_type = TYPE_HALF;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_HALF:
data_type = TYPE_HALF;
data_elements = 1;
break;
case IMAGE_DATA_TYPE_USHORT4:
data_type = TYPE_UINT16;
data_elements = 4;
break;
case IMAGE_DATA_TYPE_USHORT:
data_type = TYPE_UINT16;
data_elements = 1;
break;
case IMAGE_DATA_NUM_TYPES:
assert(0);
return;
}
memset(&info, 0, sizeof(info));
info.data_type = image_data_type;
info.interpolation = interpolation;
info.extension = extension;
memset(&info, 0, sizeof(info));
info.data_type = image_data_type;
info.interpolation = interpolation;
info.extension = extension;
}
device_texture::~device_texture()
{
device_free();
host_free();
}
/* Host memory allocation. */
void *device_texture::alloc(const size_t width, const size_t height, const size_t depth)
{
const size_t new_size = size(width, height, depth);
if (new_size != data_size) {
device_texture::~device_texture() {
device_free();
host_free();
host_pointer = host_alloc(data_elements * datatype_size(data_type) * new_size);
assert(device_pointer == 0);
}
data_size = new_size;
data_width = width;
data_height = height;
data_depth = depth;
info.width = width;
info.height = height;
info.depth = depth;
return host_pointer;
}
void device_texture::copy_to_device()
{
device_copy_to();
void *device_texture::alloc(const size_t width, const size_t height, const size_t depth) {
const size_t new_size = size(width, height, depth);
if (new_size != data_size) {
device_free();
host_free();
host_pointer = host_alloc(data_elements * datatype_size(data_type) * new_size);
assert(device_pointer == nullptr);
}
data_size = new_size;
data_width = width;
data_height = height;
data_depth = depth;
info.width = width;
info.height = height;
info.depth = depth;
return host_pointer;
}
void device_texture::copy_to_device() {
device_copy_to();
}
CCL_NAMESPACE_END

View File

@ -1,33 +1,16 @@
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
*
* SPDX-License-Identifier: Apache-2.0 */
#include "kernel/sample/util.h"
#include "util/hash.h"
#pragma once
CCL_NAMESPACE_BEGIN
ccl_device uint tabulated_sobol_shuffled_sample_index(KernelGlobals kg,
uint sample,
uint dimension,
uint seed)
{
const uint sample_count = kernel_data.integrator.tabulated_sobol_sequence_size;
const uint sample_count = kernel_data.integrator.tabulated_sobol_sequence_size;
const uint pattern_i = hash_shuffle_uint(dimension, NUM_TAB_SOBOL_PATTERNS, seed);
const uint sample_mask = sample_count - 1;
/* Shuffle the pattern order and sample index to decorrelate
* dimensions and make the most of the finite patterns we have.
* The funky sample mask stuff is to ensure that we only shuffle
* *within* the current sample pattern, which is necessary to avoid
* early repeat pattern use. */
const uint pattern_i = hash_shuffle_uint(dimension, NUM_TAB_SOBOL_PATTERNS, seed);
/* sample_count should always be a power of two, so this results in a mask. */
const uint sample_mask = sample_count - 1;
const uint sample_shuffled = nested_uniform_scramble(sample,
hash_wang_seeded_uint(dimension, seed));
sample = (sample & ~sample_mask) | (sample_shuffled & sample_mask);
const uint sample_shuffled = nested_uniform_scramble(sample, hash_wang_seeded_uint(dimension, seed)) & sample_mask;
sample = (sample & ~sample_mask) | sample_shuffled;
return ((pattern_i * sample_count) + sample) % (sample_count * NUM_TAB_SOBOL_PATTERNS);
return ((pattern_i * sample_count) + sample) % (sample_count * NUM_TAB_SOBOL_PATTERNS);
}
ccl_device float tabulated_sobol_sample_1D(KernelGlobals kg,
@ -35,27 +18,16 @@ ccl_device float tabulated_sobol_sample_1D(KernelGlobals kg,
const uint rng_hash,
const uint dimension)
{
uint seed = rng_hash;
const uint seed = (kernel_data.integrator.scrambling_distance < 1.0f) ? kernel_data.integrator.seed : rng_hash;
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
/* Use the same sample sequence seed for all pixels when using
* scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
seed = kernel_data.integrator.seed;
}
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) * kernel_data.integrator.scrambling_distance;
x += jitter_x - floorf(x + jitter_x);
}
/* Fetch the sample. */
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
/* Do limited Cranley-Patterson rotation when using scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) *
kernel_data.integrator.scrambling_distance;
x += jitter_x;
x -= floorf(x);
}
return x;
return x;
}
ccl_device float2 tabulated_sobol_sample_2D(KernelGlobals kg,
@ -63,32 +35,20 @@ ccl_device float2 tabulated_sobol_sample_2D(KernelGlobals kg,
const uint rng_hash,
const uint dimension)
{
uint seed = rng_hash;
const uint seed = (kernel_data.integrator.scrambling_distance < 1.0f) ? kernel_data.integrator.seed : rng_hash;
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
/* Use the same sample sequence seed for all pixels when using
* scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
seed = kernel_data.integrator.seed;
}
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
float y = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 1);
/* Fetch the sample. */
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
float y = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 1);
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) * kernel_data.integrator.scrambling_distance;
const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) * kernel_data.integrator.scrambling_distance;
x += jitter_x - floorf(x + jitter_x);
y += jitter_y - floorf(y + jitter_y);
}
/* Do limited Cranley-Patterson rotation when using scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) *
kernel_data.integrator.scrambling_distance;
const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) *
kernel_data.integrator.scrambling_distance;
x += jitter_x;
y += jitter_y;
x -= floorf(x);
y -= floorf(y);
}
return make_float2(x, y);
return make_float2(x, y);
}
ccl_device float3 tabulated_sobol_sample_3D(KernelGlobals kg,
@ -96,37 +56,23 @@ ccl_device float3 tabulated_sobol_sample_3D(KernelGlobals kg,
const uint rng_hash,
const uint dimension)
{
uint seed = rng_hash;
const uint seed = (kernel_data.integrator.scrambling_distance < 1.0f) ? kernel_data.integrator.seed : rng_hash;
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
/* Use the same sample sequence seed for all pixels when using
* scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
seed = kernel_data.integrator.seed;
}
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
float y = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 1);
float z = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 2);
/* Fetch the sample. */
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
float y = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 1);
float z = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 2);
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) * kernel_data.integrator.scrambling_distance;
const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) * kernel_data.integrator.scrambling_distance;
const float jitter_z = hash_wang_seeded_float(dimension, rng_hash ^ 0xbf604c5a) * kernel_data.integrator.scrambling_distance;
x += jitter_x - floorf(x + jitter_x);
y += jitter_y - floorf(y + jitter_y);
z += jitter_z - floorf(z + jitter_z);
}
/* Do limited Cranley-Patterson rotation when using scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) *
kernel_data.integrator.scrambling_distance;
const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) *
kernel_data.integrator.scrambling_distance;
const float jitter_z = hash_wang_seeded_float(dimension, rng_hash ^ 0xbf604c5a) *
kernel_data.integrator.scrambling_distance;
x += jitter_x;
y += jitter_y;
z += jitter_z;
x -= floorf(x);
y -= floorf(y);
z -= floorf(z);
}
return make_float3(x, y, z);
return make_float3(x, y, z);
}
ccl_device float4 tabulated_sobol_sample_4D(KernelGlobals kg,
@ -134,42 +80,24 @@ ccl_device float4 tabulated_sobol_sample_4D(KernelGlobals kg,
const uint rng_hash,
const uint dimension)
{
uint seed = rng_hash;
const uint seed = (kernel_data.integrator.scrambling_distance < 1.0f) ? kernel_data.integrator.seed : rng_hash;
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
/* Use the same sample sequence seed for all pixels when using
* scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
seed = kernel_data.integrator.seed;
}
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
float y = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 1);
float z = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 2);
float w = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 3);
/* Fetch the sample. */
const uint index = tabulated_sobol_shuffled_sample_index(kg, sample, dimension, seed);
float x = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS);
float y = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 1);
float z = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 2);
float w = kernel_data_fetch(sample_pattern_lut, index * NUM_TAB_SOBOL_DIMENSIONS + 3);
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) * kernel_data.integrator.scrambling_distance;
const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) * kernel_data.integrator.scrambling_distance;
const float jitter_z = hash_wang_seeded_float(dimension, rng_hash ^ 0xbf604c5a) * kernel_data.integrator.scrambling_distance;
const float jitter_w = hash_wang_seeded_float(dimension, rng_hash ^ 0x99634d1d) * kernel_data.integrator.scrambling_distance;
x += jitter_x - floorf(x + jitter_x);
y += jitter_y - floorf(y + jitter_y);
z += jitter_z - floorf(z + jitter_z);
w += jitter_w - floorf(w + jitter_w);
}
/* Do limited Cranley-Patterson rotation when using scrambling distance. */
if (kernel_data.integrator.scrambling_distance < 1.0f) {
const float jitter_x = hash_wang_seeded_float(dimension, rng_hash) *
kernel_data.integrator.scrambling_distance;
const float jitter_y = hash_wang_seeded_float(dimension, rng_hash ^ 0xca0e1151) *
kernel_data.integrator.scrambling_distance;
const float jitter_z = hash_wang_seeded_float(dimension, rng_hash ^ 0xbf604c5a) *
kernel_data.integrator.scrambling_distance;
const float jitter_w = hash_wang_seeded_float(dimension, rng_hash ^ 0x99634d1d) *
kernel_data.integrator.scrambling_distance;
x += jitter_x;
y += jitter_y;
z += jitter_z;
w += jitter_w;
x -= floorf(x);
y -= floorf(y);
z -= floorf(z);
w -= floorf(w);
}
return make_float4(x, y, z, w);
return make_float4(x, y, z, w);
}
CCL_NAMESPACE_END