This is the first of a sequence of changes to support compiling Cycles kernels as MSL (Metal Shading Language) in preparation for a Metal GPU device implementation.

MSL requires that all pointer types be declared with explicit address space attributes (device, thread, etc.). There is already precedent for this with Cycles' address space macros (ccl_global, ccl_private, etc.), therefore the first step of MSL-enablement is to apply these consistently. Line-for-line this represents the largest change required to enable MSL. Applying this change first will simplify future patches as well as offering the emergent benefit of enhanced descriptiveness.

The vast majority of deltas in this patch fall into one of two cases:
- Ensuring ccl_private is specified for thread-local pointer types
- Ensuring ccl_global is specified for device-wide pointer types

Additionally, the ccl_addr_space qualifier can be removed. Prior to Cycles X, ccl_addr_space was used as a context-dependent address space qualifier, but now it is either redundant (e.g. in struct typedefs), or can be replaced by ccl_global in the case of pointer types. Associated function variants (e.g. lcg_step_float_addrspace) are also redundant.

In cases where address space qualifiers are chained with "const", this patch places the address space qualifier first. The rationale for this is that the choice of address space is likely to have the greater impact on runtime performance and overall architecture.

The final part of this patch is the addition of a metal/compat.h header. This is partially complete and will be extended in future patches, paving the way for the full Metal implementation.

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D12864
107 lines
3.9 KiB
C++
107 lines
3.9 KiB
C++
/*
|
|
* Copyright 2018 Blender Foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
CCL_NAMESPACE_BEGIN
|
|
|
|
/* Element of ID pass stored in the render buffers.
 * It is `float2` semantically, but it must be unaligned since the offset of ID passes in the
 * render buffers might not meet the alignment expected by the compiler. */
|
|
typedef struct IDPassBufferElement {
  /* Identifier stored in the slot; the slot-writing code compares it against `ID_NONE` to detect
   * an empty slot. */
  float x;
  /* Weight accumulated for the identifier in `x`. */
  float y;
} IDPassBufferElement;
|
|
|
|
/* Accumulate `weight` for `id` into a list of ID slots.
 *
 * `buffer` is accessed as `num_slots` consecutive (id, weight) pairs of floats. The weight goes
 * to the first slot that is either empty (`ID_NONE`) or already holds `id`. When every slot up
 * to the last one is taken by other IDs, the weight is added to the last slot instead.
 *
 * Nothing is written when `weight` is zero. `id` must not be `ID_NONE`. */
ccl_device_inline void kernel_write_id_slots(ccl_global float *buffer,
                                             int num_slots,
                                             float id,
                                             float weight)
{
  kernel_assert(id != ID_NONE);
  if (weight == 0.0f) {
    return;
  }

  /* The (id, weight) view of the buffer is the same for every slot, so set it up once. */
  ccl_global IDPassBufferElement *id_buffer = (ccl_global IDPassBufferElement *)buffer;
  const int last_slot = num_slots - 1;

  for (int slot = 0; slot < num_slots; slot++) {
#ifdef __ATOMIC_PASS_WRITE__
    float slot_id = id_buffer[slot].x;

    /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
    if (slot_id == ID_NONE) {
      /* Use an atomic to claim this slot. The result is used as the value the slot held before
       * the exchange: `ID_NONE` means this thread claimed it, anything else means a different
       * thread got here first. */
      const float old_id = atomic_compare_and_swap_float(buffer + slot * 2, ID_NONE, id);
      slot_id = (old_id == ID_NONE) ? id : old_id;
    }

    /* Add the weight if the slot belongs to this ID (already, or because it was just claimed).
     * If no slot was found, add it to the last. This also covers losing the race for the last
     * slot to another ID, so the weight is never silently dropped. */
    if (slot_id == id || slot == last_slot) {
      atomic_add_and_fetch_float(buffer + slot * 2 + 1, weight);
      break;
    }
    /* Slot is owned by another ID: try the next one. */
#else  /* __ATOMIC_PASS_WRITE__ */
    const float slot_id = id_buffer[slot].x;

    /* If the loop reaches an empty slot, the ID isn't in any slot yet - so add it! */
    if (slot_id == ID_NONE) {
      id_buffer[slot].x = id;
      id_buffer[slot].y = weight;
      break;
    }

    /* If there already is a slot for that ID, add the weight.
     * If no slot was found, add it to the last. */
    if (slot_id == id || slot == last_slot) {
      id_buffer[slot].y += weight;
      break;
    }
#endif /* __ATOMIC_PASS_WRITE__ */
  }
}
|
|
|
|
/* Sort the (id, weight) pairs stored in `buffer` by weight, largest first.
 *
 * `buffer` is accessed as `num_slots` pairs of floats. Processing starts at the second slot and
 * stops at the first slot from there on whose id is `ID_NONE`; the first slot itself is never
 * tested for being empty. */
ccl_device_inline void kernel_sort_id_slots(ccl_global float *buffer, int num_slots)
{
  ccl_global IDPassBufferElement *elements = (ccl_global IDPassBufferElement *)buffer;

  for (int filled = 1; filled < num_slots && elements[filled].x != ID_NONE; ++filled) {
    /* Since we're dealing with a tiny number of elements, insertion sort should be fine:
     * bubble the new element towards the front while it outweighs its predecessor. */
    for (int pos = filled; pos > 0 && elements[pos - 1].y < elements[pos].y; --pos) {
      const IDPassBufferElement moved = elements[pos];
      elements[pos] = elements[pos - 1];
      elements[pos - 1] = moved;
    }
  }
}
|
|
|
|
/* post-sorting for Cryptomatte */
|
|
/* Sort the Cryptomatte ID slots of a single pixel.
 *
 * The pixel's data starts `pixel_index * film.pass_stride` floats into `render_buffer`, and the
 * Cryptomatte slots start `film.pass_cryptomatte` floats into the pixel. Two slots are sorted
 * per unit of `film.cryptomatte_depth`.
 *
 * NOTE(review): `kg` is not referenced by name here; presumably the `kernel_data` accessor
 * uses it - confirm against its definition. */
ccl_device_inline void kernel_cryptomatte_post(ccl_global const KernelGlobals *kg,
                                               ccl_global float *render_buffer,
                                               int pixel_index)
{
  const int stride = kernel_data.film.pass_stride;
  const int num_slots = 2 * kernel_data.film.cryptomatte_depth;

  /* Widen before multiplying so the pixel offset is computed in 64 bits. */
  const uint64_t pixel_offset = (uint64_t)pixel_index * stride;
  ccl_global float *pixel = render_buffer + pixel_offset;

  kernel_sort_id_slots(pixel + kernel_data.film.pass_cryptomatte, num_slots);
}
|
|
|
|
CCL_NAMESPACE_END
|