blender-archive/intern/cycles/kernel/device/cuda/compat.h
Commit a0f269f682 by Michael Jones (Apple): "Cycles: Kernel address space changes for MSL"
This is the first of a sequence of changes to support compiling Cycles kernels as MSL (Metal Shading Language) in preparation for a Metal GPU device implementation.

MSL requires that all pointer types be declared with explicit address space attributes (device, thread, etc.). There is already precedent for this with Cycles' address space macros (ccl_global, ccl_private, etc.), so the first step of MSL enablement is to apply these consistently. Line for line, this represents the largest change required to enable MSL. Applying it first will simplify future patches and has the added benefit of making the code more descriptive.
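
To make the requirement concrete, here is a small, hypothetical MSL-style signature (the function and parameter names are invented for this sketch and do not appear in the patch):

  /* In MSL, every pointer must name its address space explicitly. */
  void accumulate_sample(device float *render_buffer,      /* device-visible memory */
                         thread const float *sample_color) /* thread-local memory */
  {
    render_buffer[0] += sample_color[0];
  }

In Cycles the same information is carried by the address space macros, which can expand to nothing on CUDA (as in the header below) and to the corresponding MSL keywords on Metal.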

The vast majority of deltas in this patch fall into one of two cases:

- Ensuring ccl_private is specified for thread-local pointer types
- Ensuring ccl_global is specified for device-wide pointer types (a before/after sketch follows the list)
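
A before/after sketch of those two cases (example_accumulate and its parameters are invented for illustration; they are not lines from the patch):

  /* Before: pointer address spaces left implicit. */
  ccl_device void example_accumulate(const float *buffer, float *local_sum);

  /* After: ccl_global marks the device-wide buffer, ccl_private the thread-local sum. */
  ccl_device void example_accumulate(ccl_global const float *buffer,
                                     ccl_private float *local_sum);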

Additionally, the ccl_addr_space qualifier can be removed. Prior to Cycles X, ccl_addr_space was used as a context-dependent address space qualifier, but now it is either redundant (e.g. in struct typedefs), or can be replaced by ccl_global in the case of pointer types. Associated function variants (e.g. lcg_step_float_addrspace) are also redundant.
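
Sketching the lcg_step_float_addrspace case (declarations simplified and hypothetical; the exact signatures and the qualifier chosen in the patch may differ):

  /* Before: a generic variant plus a duplicate taking a ccl_addr_space pointer. */
  ccl_device float lcg_step_float(uint *rng);
  ccl_device float lcg_step_float_addrspace(ccl_addr_space uint *rng);

  /* After: a single variant with an explicit address space (ccl_private is shown
   * purely for illustration), making the _addrspace duplicate redundant. */
  ccl_device float lcg_step_float(ccl_private uint *rng);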

In cases where address space qualifiers are chained with "const", this patch places the address space qualifier first. The rationale for this is that the choice of address space is likely to have the greater impact on runtime performance and overall architecture.
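
For example (an illustrative declaration, not a specific line from the patch):

  const ccl_global float *weights;  /* before: const written first */
  ccl_global const float *weights;  /* after: address space qualifier written first */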

The final part of this patch is the addition of a metal/compat.h header. This is partially complete and will be extended in future patches, paving the way for the full Metal implementation.
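
The new metal/compat.h itself is not reproduced here. As a rough, hypothetical sketch of the idea only (these mappings are an assumption, not the actual file contents), such a header would map the Cycles qualifiers onto MSL keywords, mirroring how the CUDA header below maps them onto CUDA ones:

  /* Hypothetical fragment of a metal/compat.h counterpart. */
  #define ccl_global device
  #define ccl_private thread
  #define ccl_constant constant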

Ref T92212

Reviewed By: brecht

Maniphest Tasks: T92212

Differential Revision: https://developer.blender.org/D12864
2021-10-14 16:14:43 +01:00


/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#define __KERNEL_GPU__
#define __KERNEL_CUDA__
#define CCL_NAMESPACE_BEGIN
#define CCL_NAMESPACE_END
#ifndef ATTR_FALLTHROUGH
# define ATTR_FALLTHROUGH
#endif
/* Manual definitions so we can compile without CUDA toolkit. */
#ifdef __CUDACC_RTC__
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#else
# include <stdint.h>
#endif
#ifdef CYCLES_CUBIN_CC
# define FLT_MIN 1.175494350822287507969e-38f
# define FLT_MAX 340282346638528859811704183484516925440.0f
# define FLT_EPSILON 1.192092896e-07F
#endif
/* Qualifiers */
#define ccl_device __device__ __inline__
#if __CUDA_ARCH__ < 500
# define ccl_device_inline __device__ __forceinline__
# define ccl_device_forceinline __device__ __forceinline__
#else
# define ccl_device_inline __device__ __inline__
# define ccl_device_forceinline __device__ __forceinline__
#endif
#define ccl_device_noinline __device__ __noinline__
#define ccl_device_noinline_cpu ccl_device
#define ccl_global
#define ccl_static_constant __constant__
#define ccl_device_constant __constant__ __device__
#define ccl_constant const
#define ccl_gpu_shared __shared__
#define ccl_private
#define ccl_may_alias
#define ccl_restrict __restrict__
#define ccl_loop_no_unroll
#define ccl_align(n) __align__(n)
#define ccl_optional_struct_init
/* No assert supported for CUDA */
#define kernel_assert(cond)
/* GPU thread, block, grid size and index */
#define ccl_gpu_thread_idx_x (threadIdx.x)
#define ccl_gpu_block_dim_x (blockDim.x)
#define ccl_gpu_block_idx_x (blockIdx.x)
#define ccl_gpu_grid_dim_x (gridDim.x)
#define ccl_gpu_warp_size (warpSize)
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
/* GPU synchronization and warp-level intrinsics. */
#define ccl_gpu_syncthreads() __syncthreads()
#define ccl_gpu_ballot(predicate) __ballot_sync(0xFFFFFFFF, predicate)
#define ccl_gpu_shfl_down_sync(mask, var, delta) __shfl_down_sync(mask, var, delta)
#define ccl_gpu_popc(x) __popc(x)
/* GPU texture objects */
typedef unsigned long long CUtexObject;
typedef CUtexObject ccl_gpu_tex_object;
template<typename T>
ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
                                                    const float x,
                                                    const float y)
{
  return tex2D<T>(texobj, x, y);
}
template<typename T>
ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
                                                    const float x,
                                                    const float y,
                                                    const float z)
{
  return tex3D<T>(texobj, x, y, z);
}
/* Use fast math functions */
#define cosf(x) __cosf(((float)(x)))
#define sinf(x) __sinf(((float)(x)))
#define powf(x, y) __powf(((float)(x)), ((float)(y)))
#define tanf(x) __tanf(((float)(x)))
#define logf(x) __logf(((float)(x)))
#define expf(x) __expf(((float)(x)))
/* Half */
typedef unsigned short half;
/* Convert float to half via inline PTX, rounding to nearest even. */
__device__ half __float2half(const float f)
{
  half val;
  asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(val) : "f"(f));
  return val;
}
/* Types */
#include "util/util_half.h"
#include "util/util_types.h"