This repository has been archived on 2023-10-09. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
blender-archive/intern/cycles/kernel/kernels/cuda/kernel_config.h
Brecht Van Lommel 60e5ebdd62 Fix Cycles CUDA kernels for Ampere not building with CUDA 11
Running Blender on Ampere cards was already possible with ptx, this fix is
needed to support building CUDA binaries.

Note the CUDA version used for official Blender builds is still 10, this is
merely the change to make it possible for those using CUDA 11 and specifying
the sm_8x kernels to be compiled.

Found by Milan Jaros.
2020-09-30 18:29:42 +02:00

122 lines
4.0 KiB
C++

/*
* Copyright 2011-2013 Blender Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* device data taken from CUDA occupancy calculator */
/* 3.0 and 3.5 */
#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.2 */
#elif __CUDA_ARCH__ == 320
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 63
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 3.7 */
#elif __CUDA_ARCH__ == 370
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 63
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 5.x, 6.x */
#elif __CUDA_ARCH__ <= 699
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
* registers */
# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
# define CUDA_KERNEL_MAX_REGISTERS 64
# else
# define CUDA_KERNEL_MAX_REGISTERS 48
# endif
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
/* 7.x, 8.x */
#elif __CUDA_ARCH__ <= 899
# define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
# define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
# define CUDA_BLOCK_MAX_THREADS 1024
# define CUDA_THREAD_MAX_REGISTERS 255
/* tunable parameters */
# define CUDA_THREADS_BLOCK_WIDTH 16
# define CUDA_KERNEL_MAX_REGISTERS 64
# define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
/* unknown architecture */
#else
# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
#endif
/* For split kernel using all registers seems fastest for now, but this
* is unlikely to be optimal once we resolve other bottlenecks. */
#define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
/* Compute number of threads per block and minimum blocks per multiprocessor
* given the maximum number of registers per thread. */
#define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
__launch_bounds__(threads_block_width *threads_block_width, \
CUDA_MULTIPRESSOR_MAX_REGISTERS / \
(threads_block_width * threads_block_width * thread_num_registers))
/* sanity checks */
#if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
# error "Maximum number of threads per block exceeded"
#endif
#if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
(CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
CUDA_MULTIPROCESSOR_MAX_BLOCKS
# error "Maximum number of blocks per multiprocessor exceeded"
#endif
#if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif
#if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
# error "Maximum number of registers per thread exceeded"
#endif