This is the first of a sequence of changes to support compiling Cycles kernels as MSL (Metal Shading Language), in preparation for a Metal GPU device implementation. MSL requires that all pointer types be declared with explicit address space attributes (device, thread, etc.). There is already precedent for this in Cycles' address space macros (ccl_global, ccl_private, etc.), so the first step of MSL enablement is to apply these consistently. Line for line, this represents the largest change required to enable MSL. Applying it first will simplify future patches, and it has the side benefit of making pointer declarations more descriptive.

The vast majority of deltas in this patch fall into one of two cases (see the sketch below):

- Ensuring ccl_private is specified for thread-local pointer types
- Ensuring ccl_global is specified for device-wide pointer types

Additionally, the ccl_addr_space qualifier can be removed. Prior to Cycles X, ccl_addr_space was used as a context-dependent address space qualifier, but now it is either redundant (e.g. in struct typedefs) or can be replaced by ccl_global in the case of pointer types. Associated function variants (e.g. lcg_step_float_addrspace) are also redundant.

In cases where address space qualifiers are chained with "const", this patch places the address space qualifier first, on the rationale that the choice of address space is likely to have the greater impact on runtime performance and overall architecture.

The final part of this patch is the addition of a metal/compat.h header. This is partially complete and will be extended in future patches, paving the way for the full Metal implementation.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D12864
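To make the convention concrete, here is a minimal sketch of the intended qualifier usage; it is not an excerpt from the patch. The macro bodies, the __METAL_VERSION__ guard, and the KernelGlobalsStub type and example_need_sample function are illustrative assumptions only; the real definitions live in Cycles' utility headers and the new metal/compat.h.

```cpp
/* Hypothetical, simplified stand-ins for the real address space macros. Under
 * MSL the qualifiers must map onto Metal's address space keywords; on other
 * backends they can compile away (or map to backend-specific keywords). */
#ifdef __METAL_VERSION__
#  define ccl_global device  /* device-wide memory, e.g. render buffers */
#  define ccl_private thread /* thread-local memory, e.g. stack variables */
#else
#  define ccl_global
#  define ccl_private
#endif

/* Stand-in for the real KernelGlobals, for illustration only. */
struct KernelGlobalsStub {
  int pad;
};

/* Every pointer names its address space explicitly, and where an address space
 * qualifier is chained with "const", the address space comes first: */
bool example_need_sample(ccl_global const KernelGlobalsStub *kg, /* device-wide, read-only */
                         ccl_global float *render_buffer,        /* device-wide */
                         ccl_private float *local_result)        /* thread-local */
{
  (void)kg;
  *local_result = render_buffer[0];
  return *local_result == 0.0f;
}
```

The kernel source then compiles unchanged on every backend; the per-backend work reduces to choosing what each macro expands to, which is the role metal/compat.h plays for MSL.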
/*
 * Copyright 2019 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel/kernel_write_passes.h"

CCL_NAMESPACE_BEGIN
/* Check whether the pixel has converged and should not be sampled anymore. */

ccl_device_forceinline bool kernel_need_sample_pixel(INTEGRATOR_STATE_CONST_ARGS,
                                                     ccl_global float *render_buffer)
{
  if (kernel_data.film.pass_adaptive_aux_buffer == PASS_UNUSED) {
    return true;
  }

  const uint32_t render_pixel_index = INTEGRATOR_STATE(path, render_pixel_index);
  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index *
                                        kernel_data.film.pass_stride;
  ccl_global float *buffer = render_buffer + render_buffer_offset;
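
  /* The w channel of the adaptive aux pass holds the convergence flag: it is set
   * non-zero by kernel_adaptive_sampling_convergence_check() below once the pixel
   * has converged, so zero means the pixel still needs samples. */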
  const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
  return buffer[aux_w_offset] == 0.0f;
}

/* Determines whether to continue sampling a given pixel or if it has sufficiently converged. */

ccl_device bool kernel_adaptive_sampling_convergence_check(ccl_global const KernelGlobals *kg,
                                                           ccl_global float *render_buffer,
                                                           int x,
                                                           int y,
                                                           float threshold,
                                                           bool reset,
                                                           int offset,
                                                           int stride)
{
  kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);
  kernel_assert(kernel_data.film.pass_sample_count != PASS_UNUSED);

  const int render_pixel_index = offset + x + y * stride;
  ccl_global float *buffer = render_buffer +
                             (uint64_t)render_pixel_index * kernel_data.film.pass_stride;

  /* TODO(Stefan): Is this better in linear, sRGB or something else? */

  const float4 A = kernel_read_pass_float4(buffer + kernel_data.film.pass_adaptive_aux_buffer);
  if (!reset && A.w != 0.0f) {
    /* If the pixel was considered converged, its state will not change in this kernel. Early
     * output before doing any math.
     *
     * TODO(sergey): On a GPU it might be better to keep thread alive for better coherency? */
    return true;
  }

  const float4 I = kernel_read_pass_float4(buffer + kernel_data.film.pass_combined);
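
  /* The sample count pass stores the count as a uint bit-pattern inside the float
   * buffer, hence the __float_as_uint() read; the value then converts implicitly
   * to float for the division below. */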
  const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
  const float inv_sample = 1.0f / sample;

  /* The per pixel error as seen in section 2.1 of
   * "A hierarchical automatic stopping condition for Monte Carlo global illumination". */
  const float error_difference = (fabsf(I.x - A.x) + fabsf(I.y - A.y) + fabsf(I.z - A.z)) *
                                 inv_sample;
  const float error_normalize = sqrtf((I.x + I.y + I.z) * inv_sample);
  /* A small epsilon is added to the divisor to prevent division by zero. */
  const float error = error_difference / (0.0001f + error_normalize);
  const bool did_converge = (error < threshold);

  const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;
  buffer[aux_w_offset] = did_converge;

  return did_converge;
}

/* This is a simple box filter in two passes.
 * When a pixel demands more adaptive samples, let its neighboring pixels draw more samples too. */
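/* Each pass dilates the set of unconverged pixels by one: a converged pixel whose
 * row (or column) neighbor is still unconverged has its convergence flag cleared
 * again, so it keeps receiving samples. */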

ccl_device void kernel_adaptive_sampling_filter_x(ccl_global const KernelGlobals *kg,
                                                  ccl_global float *render_buffer,
                                                  int y,
                                                  int start_x,
                                                  int width,
                                                  int offset,
                                                  int stride)
{
  kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);

  bool prev = false;
  for (int x = start_x; x < start_x + width; ++x) {
    int index = offset + x + y * stride;
    ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
    const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;

    if (buffer[aux_w_offset] == 0.0f) {
      if (x > start_x && !prev) {
        index = index - 1;
        buffer = render_buffer + index * kernel_data.film.pass_stride;
        buffer[aux_w_offset] = 0.0f;
      }
      prev = true;
    }
    else {
      if (prev) {
        buffer[aux_w_offset] = 0.0f;
      }
      prev = false;
    }
  }
}

ccl_device void kernel_adaptive_sampling_filter_y(ccl_global const KernelGlobals *kg,
                                                  ccl_global float *render_buffer,
                                                  int x,
                                                  int start_y,
                                                  int height,
                                                  int offset,
                                                  int stride)
{
  kernel_assert(kernel_data.film.pass_adaptive_aux_buffer != PASS_UNUSED);

  bool prev = false;
  for (int y = start_y; y < start_y + height; ++y) {
    int index = offset + x + y * stride;
    ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
    const uint aux_w_offset = kernel_data.film.pass_adaptive_aux_buffer + 3;

    if (buffer[aux_w_offset] == 0.0f) {
      if (y > start_y && !prev) {
        index = index - stride;
        buffer = render_buffer + index * kernel_data.film.pass_stride;
        buffer[aux_w_offset] = 0.0f;
      }
      prev = true;
    }
    else {
      if (prev) {
        buffer[aux_w_offset] = 0.0f;
      }
      prev = false;
    }
  }
}

CCL_NAMESPACE_END