Compare commits
292 Commits
soc-2021-k
...
temp-node-
Author | SHA1 | Date | |
---|---|---|---|
cb6b0d088a | |||
3453b22b1e | |||
9628ef4135 | |||
80d7cac22d | |||
ae0f944a57 | |||
5f632f9f6e | |||
af13168a3f | |||
e9dac3eab8 | |||
fdcae48663 | |||
2c2516bfc9 | |||
66e24ce35b | |||
84dcf12ceb | |||
81f552e9ad | |||
cd03f5b6e5 | |||
0cddbcf1d7 | |||
a2e321aa6d | |||
d3d021601d | |||
19785cb022 | |||
![]() |
22c61e8060 | ||
6f23e4484d | |||
1d478851f8 | |||
214baf5422 | |||
45a312fd8f | |||
9d9f205dc4 | |||
df9120b365 | |||
6aac892fad | |||
367775ac6a | |||
4d4113adc2 | |||
fe070fe33b | |||
901fa96b7f | |||
1f4545dc9c | |||
b80ed8396d | |||
c33a005297 | |||
ef29bf9023 | |||
![]() |
5cebcb415e | ||
ffb9577ac9 | |||
731325a022 | |||
adaf4f56e1 | |||
![]() |
78b9a8c7b9 | ||
6351c73b75 | |||
0c32e3b312 | |||
4cf4bb2664 | |||
eabb134840 | |||
24a965bb16 | |||
f51bef75f4 | |||
8cbec0beb2 | |||
![]() |
a285299ebb | ||
![]() |
bf06f76be6 | ||
008ae26712 | |||
eddc3f5bc8 | |||
756c22bb41 | |||
d1220f795f | |||
8e40bb2dea | |||
84251acfcc | |||
e81533b25e | |||
f0f70729b1 | |||
7f5d62dfc6 | |||
4a48482247 | |||
960b21e1d7 | |||
6dceaafe5a | |||
b524153d61 | |||
79290f5160 | |||
efe3a13b55 | |||
9f0a3a99ab | |||
283d76a70d | |||
0830211c95 | |||
f84fb12f5d | |||
76377f0176 | |||
2ecd963d87 | |||
5cdb2aadfc | |||
![]() |
c7a7c3f5e5 | ||
87e315c237 | |||
![]() |
85aac0ef6a | ||
cc653c9b02 | |||
95fca22bfe | |||
86ec9d79ec | |||
e45ffce5fa | |||
![]() |
faedfd5740 | ||
![]() |
044a77352f | ||
262b211856 | |||
797064544e | |||
44e4f077a9 | |||
b32b38b380 | |||
f35ea668a1 | |||
e694165123 | |||
6ee2f2da96 | |||
10d926cd4a | |||
3acf3e9e2f | |||
330a04d7c7 | |||
53fa4801a0 | |||
34ba6968b2 | |||
f17ca53cdd | |||
640c4ace0b | |||
ff7e67afd5 | |||
728ae33f37 | |||
52a702468a | |||
d2004326a1 | |||
3e78c9e5bb | |||
3674347849 | |||
73b2ecb297 | |||
![]() |
9490db1ad2 | ||
6a745e54f6 | |||
5d160dec3b | |||
b3431a8846 | |||
6f29801f1b | |||
b91946780c | |||
741fa8180c | |||
e5ff9f3615 | |||
a64782b133 | |||
7cd43a9d28 | |||
def8fd6330 | |||
e7b9423623 | |||
c7d94a7827 | |||
986d60490c | |||
c53ffda8a4 | |||
f94164d896 | |||
5d70a4d7ee | |||
e6aabcae14 | |||
50b7253257 | |||
8da23fd5aa | |||
8967bcb755 | |||
847d355cab | |||
c76ccd85be | |||
11bfbc3337 | |||
c75c08a737 | |||
efa9667c09 | |||
10061ee18a | |||
90aa0a5256 | |||
aafbe111fc | |||
6578db57cd | |||
824733ea47 | |||
5bea5e25d5 | |||
0559971ab3 | |||
2189dfd6e2 | |||
7270ba011c | |||
4a0ddeb62b | |||
3d2ce25afd | |||
43167a2c25 | |||
5949d598bc | |||
b077f0684e | |||
a6b53ef994 | |||
2bd0205215 | |||
8fecc2a852 | |||
![]() |
0419c74ae8 | ||
4a562f5077 | |||
2a0db195c9 | |||
e87783a5ec | |||
95af9317f0 | |||
2c2e1b3d61 | |||
32ffb858d6 | |||
c618075541 | |||
a13b9d20b5 | |||
d90f542b04 | |||
5d5504d8a4 | |||
69893ef27c | |||
8dcddbcc07 | |||
30ef197c7b | |||
617954c143 | |||
547f7d23ca | |||
![]() |
f3ace5aa80 | ||
fe49904646 | |||
ddb0dc2527 | |||
10a26d583d | |||
![]() |
037e66999a | ||
d2dda0e8b9 | |||
![]() |
ad3e5d2bf5 | ||
f9e0981976 | |||
d046a1f2fa | |||
1cd8a438bb | |||
88a2b054da | |||
93b36fad68 | |||
80f7bc6d8e | |||
505422220d | |||
43394e41a8 | |||
ab09844be8 | |||
a3027fb094 | |||
b314d3e787 | |||
2dd3968335 | |||
![]() |
536f9eb82e | ||
c87e6b23be | |||
![]() |
be16794ba1 | ||
![]() |
5c0017e85a | ||
bdb8ee9717 | |||
![]() |
ef45399f3b | ||
4a2c63f4bd | |||
90b410fe74 | |||
25d4de92fa | |||
c0db8e3b41 | |||
585998987a | |||
ab8f24811d | |||
9cf593f305 | |||
45e432b894 | |||
ede14b3856 | |||
95ec6e4dd3 | |||
1a1c546124 | |||
e1e380ba38 | |||
ab9644382d | |||
2b9ca0f112 | |||
7e904139a3 | |||
2b5733ff01 | |||
dfe01628b0 | |||
e8c6e32348 | |||
59387aabe8 | |||
501b0190d6 | |||
cdcdd2c479 | |||
e7ae2840a5 | |||
d8a5b768f0 | |||
bc27bafa54 | |||
7ca48a3814 | |||
f8a0e102cf | |||
3ac342dc6d | |||
9a45a4c525 | |||
e161f39660 | |||
![]() |
fc7beac8d6 | ||
![]() |
6a88f83d67 | ||
0f764ade1a | |||
0e039749e3 | |||
bc65c7d0e5 | |||
599d96e8f9 | |||
bffda4185d | |||
26141664f0 | |||
f7608276e3 | |||
61f3d4eb7c | |||
c1b925f7ff | |||
1bdaf0ebec | |||
![]() |
7fb2b50e5d | ||
fc2255135e | |||
b8a30c7664 | |||
323fd80aad | |||
eb0eb54d96 | |||
ed541de29d | |||
354c3eee40 | |||
0a8a726014 | |||
18a4dc869d | |||
502543e46b | |||
38af29df5c | |||
105115da9f | |||
bd63944a73 | |||
942fc9f467 | |||
93997f9d0a | |||
d7f803f522 | |||
6279efbb78 | |||
d0493796a6 | |||
f11bcb5a80 | |||
490425d56e | |||
aa2493e2e7 | |||
![]() |
9b12b23d0b | ||
222fd1abf0 | |||
059d01d42e | |||
83975965a7 | |||
88692baace | |||
b659d1a560 | |||
d431b91995 | |||
4d2ca33a8a | |||
aadb7ef071 | |||
e86cf55667 | |||
f48a4aa0f9 | |||
cb173d05dc | |||
6f53988e7a | |||
ce96a75c2c | |||
3558ae3b6c | |||
b63f777950 | |||
b801e86f8b | |||
3042994c91 | |||
12924ed573 | |||
a351023bd5 | |||
754d56dcc1 | |||
bf948b2cef | |||
6e77afe6ec | |||
a78d3c5261 | |||
79bcc19240 | |||
6fb4c8f040 | |||
c99cb81452 | |||
bd01f4761c | |||
ee49991999 | |||
6611f2cb74 | |||
f893dea586 | |||
02bde2c1d5 | |||
a28ec92088 | |||
188de4bc31 | |||
bd1c4a781f | |||
3180c6b4a7 | |||
e1a0983b3c | |||
4068b6b5a7 | |||
![]() |
5033310e8a | ||
ac68b08c5b | |||
bc1e675bb9 | |||
1eba32c3e9 | |||
4762a9b09f | |||
794c2828af | |||
707bcd5693 | |||
![]() |
0d350e0193 |
@@ -419,6 +419,8 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)
|
||||
|
||||
option(WITH_CYCLES_DEVICE_CUDA "Enable Cycles CUDA compute support" ON)
|
||||
option(WITH_CYCLES_DEVICE_OPTIX "Enable Cycles OptiX support" ON)
|
||||
option(WITH_CYCLES_DEVICE_HIP "Enable Cycles HIP support" OFF)
|
||||
mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
|
||||
mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)
|
||||
|
||||
option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON)
|
||||
@@ -821,6 +823,11 @@ if(NOT WITH_CUDA_DYNLOAD)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_HIP)
|
||||
# Currently HIP must be dynamically loaded, this may change in future toolkits
|
||||
set(WITH_HIP_DYNLOAD ON)
|
||||
endif()
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Check check if submodules are cloned
|
||||
|
||||
@@ -1850,6 +1857,9 @@ elseif(WITH_CYCLES_STANDALONE)
|
||||
if(WITH_CUDA_DYNLOAD)
|
||||
add_subdirectory(extern/cuew)
|
||||
endif()
|
||||
if(WITH_HIP_DYNLOAD)
|
||||
add_subdirectory(extern/hipew)
|
||||
endif()
|
||||
if(NOT WITH_SYSTEM_GLEW)
|
||||
add_subdirectory(extern/glew)
|
||||
endif()
|
||||
|
@@ -70,16 +70,18 @@
|
||||
}
|
||||
--- a/libavcodec/rl.c
|
||||
+++ b/libavcodec/rl.c
|
||||
@@ -71,7 +71,7 @@ av_cold void ff_rl_init(RLTable *rl,
|
||||
@@ -71,17 +71,19 @@
|
||||
av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
|
||||
{
|
||||
int i, q;
|
||||
- VLC_TYPE table[1500][2] = {{0}};
|
||||
+ VLC_TYPE (*table)[2] = av_calloc(sizeof(VLC_TYPE), 1500 * 2);
|
||||
VLC vlc = { .table = table, .table_allocated = static_size };
|
||||
av_assert0(static_size <= FF_ARRAY_ELEMS(table));
|
||||
- av_assert0(static_size <= FF_ARRAY_ELEMS(table));
|
||||
+ av_assert0(static_size < 1500);
|
||||
init_vlc(&vlc, 9, rl->n + 1, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
|
||||
@@ -80,8 +80,10 @@ av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
|
||||
|
||||
for (q = 0; q < 32; q++) {
|
||||
int qmul = q * 2;
|
||||
int qadd = (q - 1) | 1;
|
||||
|
||||
@@ -91,7 +93,7 @@
|
||||
|
||||
if (q == 0) {
|
||||
qmul = 1;
|
||||
@@ -113,4 +115,5 @@ av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
|
||||
@@ -113,4 +115,5 @@
|
||||
rl->rl_vlc[q][i].run = run;
|
||||
}
|
||||
}
|
||||
|
5
extern/CMakeLists.txt
vendored
5
extern/CMakeLists.txt
vendored
@@ -67,9 +67,12 @@ endif()
|
||||
|
||||
if(WITH_CYCLES OR WITH_COMPOSITOR OR WITH_OPENSUBDIV)
|
||||
add_subdirectory(clew)
|
||||
if(WITH_CUDA_DYNLOAD)
|
||||
if((WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX) AND WITH_CUDA_DYNLOAD)
|
||||
add_subdirectory(cuew)
|
||||
endif()
|
||||
if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
|
||||
add_subdirectory(hipew)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_GHOST_X11 AND WITH_GHOST_XDND)
|
||||
|
8
extern/audaspace/bindings/C/AUD_Sound.cpp
vendored
8
extern/audaspace/bindings/C/AUD_Sound.cpp
vendored
@@ -101,6 +101,8 @@ AUD_API int AUD_Sound_getFileStreams(AUD_Sound* sound, AUD_StreamInfo **stream_i
|
||||
std::shared_ptr<File> file = std::dynamic_pointer_cast<File>(*sound);
|
||||
|
||||
if(file)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto streams = file->queryStreams();
|
||||
|
||||
@@ -117,12 +119,14 @@ AUD_API int AUD_Sound_getFileStreams(AUD_Sound* sound, AUD_StreamInfo **stream_i
|
||||
|
||||
return streams.size();
|
||||
}
|
||||
else
|
||||
catch(Exception&)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
*stream_infos = nullptr;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
AUD_API sample_t* AUD_Sound_data(AUD_Sound* sound, int* length, AUD_Specs* specs)
|
||||
{
|
||||
|
39
extern/hipew/CMakeLists.txt
vendored
Normal file
39
extern/hipew/CMakeLists.txt
vendored
Normal file
@@ -0,0 +1,39 @@
|
||||
# ***** BEGIN GPL LICENSE BLOCK *****
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License
|
||||
# as published by the Free Software Foundation; either version 2
|
||||
# of the License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software Foundation,
|
||||
# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
#
|
||||
# The Original Code is Copyright (C) 2021, Blender Foundation
|
||||
# All rights reserved.
|
||||
# ***** END GPL LICENSE BLOCK *****
|
||||
|
||||
set(INC
|
||||
.
|
||||
include
|
||||
)
|
||||
|
||||
set(INC_SYS
|
||||
|
||||
)
|
||||
|
||||
set(SRC
|
||||
src/hipew.c
|
||||
|
||||
include/hipew.h
|
||||
)
|
||||
|
||||
set(LIB
|
||||
)
|
||||
|
||||
blender_add_lib(extern_hipew "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
|
1207
extern/hipew/include/hipew.h
vendored
Normal file
1207
extern/hipew/include/hipew.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
533
extern/hipew/src/hipew.c
vendored
Normal file
533
extern/hipew/src/hipew.c
vendored
Normal file
@@ -0,0 +1,533 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License
|
||||
*/
|
||||
#ifdef _MSC_VER
|
||||
# if _MSC_VER < 1900
|
||||
# define snprintf _snprintf
|
||||
# endif
|
||||
# define popen _popen
|
||||
# define pclose _pclose
|
||||
# define _CRT_SECURE_NO_WARNINGS
|
||||
#endif
|
||||
|
||||
#include <hipew.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
# define WIN32_LEAN_AND_MEAN
|
||||
# define VC_EXTRALEAN
|
||||
# include <windows.h>
|
||||
|
||||
/* Utility macros. */
|
||||
|
||||
typedef HMODULE DynamicLibrary;
|
||||
|
||||
# define dynamic_library_open(path) LoadLibraryA(path)
|
||||
# define dynamic_library_close(lib) FreeLibrary(lib)
|
||||
# define dynamic_library_find(lib, symbol) GetProcAddress(lib, symbol)
|
||||
#else
|
||||
# include <dlfcn.h>
|
||||
|
||||
typedef void* DynamicLibrary;
|
||||
|
||||
# define dynamic_library_open(path) dlopen(path, RTLD_NOW)
|
||||
# define dynamic_library_close(lib) dlclose(lib)
|
||||
# define dynamic_library_find(lib, symbol) dlsym(lib, symbol)
|
||||
#endif
|
||||
|
||||
#define _LIBRARY_FIND_CHECKED(lib, name) \
|
||||
name = (t##name *)dynamic_library_find(lib, #name); \
|
||||
assert(name);
|
||||
|
||||
#define _LIBRARY_FIND(lib, name) \
|
||||
name = (t##name *)dynamic_library_find(lib, #name);
|
||||
|
||||
#define HIP_LIBRARY_FIND_CHECKED(name) \
|
||||
_LIBRARY_FIND_CHECKED(hip_lib, name)
|
||||
#define HIP_LIBRARY_FIND(name) _LIBRARY_FIND(hip_lib, name)
|
||||
|
||||
|
||||
static DynamicLibrary hip_lib;
|
||||
|
||||
/* Function definitions. */
|
||||
thipGetErrorName *hipGetErrorName;
|
||||
thipInit *hipInit;
|
||||
thipDriverGetVersion *hipDriverGetVersion;
|
||||
thipGetDevice *hipGetDevice;
|
||||
thipGetDeviceCount *hipGetDeviceCount;
|
||||
thipDeviceGetName *hipDeviceGetName;
|
||||
thipDeviceGetAttribute *hipDeviceGetAttribute;
|
||||
thipDeviceComputeCapability *hipDeviceComputeCapability;
|
||||
thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain;
|
||||
thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease;
|
||||
thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags;
|
||||
thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState;
|
||||
thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset;
|
||||
thipCtxCreate *hipCtxCreate;
|
||||
thipCtxDestroy *hipCtxDestroy;
|
||||
thipCtxPushCurrent *hipCtxPushCurrent;
|
||||
thipCtxPopCurrent *hipCtxPopCurrent;
|
||||
thipCtxSetCurrent *hipCtxSetCurrent;
|
||||
thipCtxGetCurrent *hipCtxGetCurrent;
|
||||
thipCtxGetDevice *hipCtxGetDevice;
|
||||
thipCtxGetFlags *hipCtxGetFlags;
|
||||
thipCtxSynchronize *hipCtxSynchronize;
|
||||
thipDeviceSynchronize *hipDeviceSynchronize;
|
||||
thipCtxGetCacheConfig *hipCtxGetCacheConfig;
|
||||
thipCtxSetCacheConfig *hipCtxSetCacheConfig;
|
||||
thipCtxGetSharedMemConfig *hipCtxGetSharedMemConfig;
|
||||
thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig;
|
||||
thipCtxGetApiVersion *hipCtxGetApiVersion;
|
||||
thipModuleLoad *hipModuleLoad;
|
||||
thipModuleLoadData *hipModuleLoadData;
|
||||
thipModuleLoadDataEx *hipModuleLoadDataEx;
|
||||
thipModuleUnload *hipModuleUnload;
|
||||
thipModuleGetFunction *hipModuleGetFunction;
|
||||
thipModuleGetGlobal *hipModuleGetGlobal;
|
||||
thipModuleGetTexRef *hipModuleGetTexRef;
|
||||
thipMemGetInfo *hipMemGetInfo;
|
||||
thipMalloc *hipMalloc;
|
||||
thipMemAllocPitch *hipMemAllocPitch;
|
||||
thipFree *hipFree;
|
||||
thipMemGetAddressRange *hipMemGetAddressRange;
|
||||
thipHostMalloc *hipHostMalloc;
|
||||
thipHostFree *hipHostFree;
|
||||
thipHostGetDevicePointer *hipHostGetDevicePointer;
|
||||
thipHostGetFlags *hipHostGetFlags;
|
||||
thipMallocManaged *hipMallocManaged;
|
||||
thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId;
|
||||
thipDeviceGetPCIBusId *hipDeviceGetPCIBusId;
|
||||
thipMemcpyPeer *hipMemcpyPeer;
|
||||
thipMemcpyHtoD *hipMemcpyHtoD;
|
||||
thipMemcpyDtoH *hipMemcpyDtoH;
|
||||
thipMemcpyDtoD *hipMemcpyDtoD;
|
||||
thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned;
|
||||
thipMemcpyParam2D *hipMemcpyParam2D;
|
||||
thipDrvMemcpy3D *hipDrvMemcpy3D;
|
||||
thipMemcpyHtoDAsync *hipMemcpyHtoDAsync;
|
||||
thipMemcpyDtoHAsync *hipMemcpyDtoHAsync;
|
||||
thipMemcpyParam2DAsync *hipMemcpyParam2DAsync;
|
||||
thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync;
|
||||
thipMemsetD8 *hipMemsetD8;
|
||||
thipMemsetD16 *hipMemsetD16;
|
||||
thipMemsetD32 *hipMemsetD32;
|
||||
thipMemsetD8Async *hipMemsetD8Async;
|
||||
thipMemsetD16Async *hipMemsetD16Async;
|
||||
thipMemsetD32Async *hipMemsetD32Async;
|
||||
thipArrayCreate *hipArrayCreate;
|
||||
thipArrayDestroy *hipArrayDestroy;
|
||||
thipArray3DCreate *hipArray3DCreate;
|
||||
thipStreamCreateWithFlags *hipStreamCreateWithFlags;
|
||||
thipStreamCreateWithPriority *hipStreamCreateWithPriority;
|
||||
thipStreamGetPriority *hipStreamGetPriority;
|
||||
thipStreamGetFlags *hipStreamGetFlags;
|
||||
thipStreamWaitEvent *hipStreamWaitEvent;
|
||||
thipStreamAddCallback *hipStreamAddCallback;
|
||||
thipStreamQuery *hipStreamQuery;
|
||||
thipStreamSynchronize *hipStreamSynchronize;
|
||||
thipStreamDestroy *hipStreamDestroy;
|
||||
thipEventCreateWithFlags *hipEventCreateWithFlags;
|
||||
thipEventRecord *hipEventRecord;
|
||||
thipEventQuery *hipEventQuery;
|
||||
thipEventSynchronize *hipEventSynchronize;
|
||||
thipEventDestroy *hipEventDestroy;
|
||||
thipEventElapsedTime *hipEventElapsedTime;
|
||||
thipFuncGetAttribute *hipFuncGetAttribute;
|
||||
thipFuncSetCacheConfig *hipFuncSetCacheConfig;
|
||||
thipModuleLaunchKernel *hipModuleLaunchKernel;
|
||||
thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor;
|
||||
thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
|
||||
thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize;
|
||||
thipTexRefSetArray *hipTexRefSetArray;
|
||||
thipTexRefSetAddress *hipTexRefSetAddress;
|
||||
thipTexRefSetAddress2D *hipTexRefSetAddress2D;
|
||||
thipTexRefSetFormat *hipTexRefSetFormat;
|
||||
thipTexRefSetAddressMode *hipTexRefSetAddressMode;
|
||||
thipTexRefSetFilterMode *hipTexRefSetFilterMode;
|
||||
thipTexRefSetFlags *hipTexRefSetFlags;
|
||||
thipTexRefGetAddress *hipTexRefGetAddress;
|
||||
thipTexRefGetArray *hipTexRefGetArray;
|
||||
thipTexRefGetAddressMode *hipTexRefGetAddressMode;
|
||||
thipTexObjectCreate *hipTexObjectCreate;
|
||||
thipTexObjectDestroy *hipTexObjectDestroy;
|
||||
thipDeviceCanAccessPeer *hipDeviceCanAccessPeer;
|
||||
|
||||
thipCtxEnablePeerAccess *hipCtxEnablePeerAccess;
|
||||
thipCtxDisablePeerAccess *hipCtxDisablePeerAccess;
|
||||
thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute;
|
||||
thipGraphicsUnregisterResource *hipGraphicsUnregisterResource;
|
||||
thipGraphicsMapResources *hipGraphicsMapResources;
|
||||
thipGraphicsUnmapResources *hipGraphicsUnmapResources;
|
||||
thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer;
|
||||
|
||||
thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer;
|
||||
thipGLGetDevices *hipGLGetDevices;
|
||||
|
||||
|
||||
|
||||
static DynamicLibrary dynamic_library_open_find(const char **paths) {
|
||||
int i = 0;
|
||||
while (paths[i] != NULL) {
|
||||
DynamicLibrary lib = dynamic_library_open(paths[i]);
|
||||
if (lib != NULL) {
|
||||
return lib;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Implementation function. */
|
||||
static void hipewHipExit(void) {
|
||||
if (hip_lib != NULL) {
|
||||
/* Ignore errors. */
|
||||
dynamic_library_close(hip_lib);
|
||||
hip_lib = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int hipewHipInit(void) {
|
||||
/* Library paths. */
|
||||
#ifdef _WIN32
|
||||
/* Expected in c:/windows/system or similar, no path needed. */
|
||||
const char *hip_paths[] = {"amdhip64.dll", NULL};
|
||||
#elif defined(__APPLE__)
|
||||
/* Default installation path. */
|
||||
const char *hip_paths[] = {"", NULL};
|
||||
#else
|
||||
const char *hip_paths[] = {"/opt/rocm/hip/lib/libamdhip64.so", NULL};
|
||||
#endif
|
||||
static int initialized = 0;
|
||||
static int result = 0;
|
||||
int error, driver_version;
|
||||
|
||||
if (initialized) {
|
||||
return result;
|
||||
}
|
||||
|
||||
initialized = 1;
|
||||
|
||||
error = atexit(hipewHipExit);
|
||||
if (error) {
|
||||
result = HIPEW_ERROR_ATEXIT_FAILED;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Load library. */
|
||||
hip_lib = dynamic_library_open_find(hip_paths);
|
||||
|
||||
if (hip_lib == NULL) {
|
||||
result = HIPEW_ERROR_OPEN_FAILED;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Fetch all function pointers. */
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGetErrorName);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipInit);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDriverGetVersion);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGetDevice);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRetain);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRelease);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxSetFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxGetState);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxReset);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxCreate);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxDestroy);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxPushCurrent);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxPopCurrent);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxSetCurrent);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxGetCurrent);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxGetDevice);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxGetFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxSynchronize);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceSynchronize);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxGetCacheConfig);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxSetCacheConfig);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxGetSharedMemConfig);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxSetSharedMemConfig);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxGetApiVersion);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleLoad);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleLoadData);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleLoadDataEx);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleUnload);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleGetFunction);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleGetGlobal);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleGetTexRef);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemGetInfo);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMalloc);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemAllocPitch);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipFree);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemGetAddressRange);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipHostMalloc);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipHostFree);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipHostGetDevicePointer);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipHostGetFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMallocManaged);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetByPCIBusId);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetPCIBusId);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyPeer);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoD);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoH);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoD);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2D);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3D);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoDAsync);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoHAsync);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy2DUnaligned);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2DAsync);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3DAsync);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemsetD8);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemsetD16);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemsetD32);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemsetD8Async);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemsetD16Async);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipMemsetD32Async);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipArrayCreate);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipArrayDestroy);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipArray3DCreate);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithPriority);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamGetPriority);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamGetFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamWaitEvent);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamAddCallback);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamQuery);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamSynchronize);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipStreamDestroy);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipEventCreateWithFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipEventRecord);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipEventQuery);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipEventSynchronize);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipEventDestroy);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipEventElapsedTime);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipFuncGetAttribute);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipFuncSetCacheConfig);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleLaunchKernel);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipModuleOccupancyMaxPotentialBlockSize);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetArray);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress2D);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFormat);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddressMode);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFilterMode);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFlags);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddress);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddressMode);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexObjectCreate);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipTexObjectDestroy);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceCanAccessPeer);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxEnablePeerAccess);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipCtxDisablePeerAccess);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipDeviceGetP2PAttribute);
|
||||
#ifdef _WIN32
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnregisterResource);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGraphicsMapResources);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnmapResources);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGraphicsResourceGetMappedPointer);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGraphicsGLRegisterBuffer);
|
||||
HIP_LIBRARY_FIND_CHECKED(hipGLGetDevices);
|
||||
#endif
|
||||
result = HIPEW_SUCCESS;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int hipewInit(hipuint32_t flags) {
|
||||
int result = HIPEW_SUCCESS;
|
||||
|
||||
if (flags & HIPEW_INIT_HIP) {
|
||||
result = hipewHipInit();
|
||||
if (result != HIPEW_SUCCESS) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
const char *hipewErrorString(hipError_t result) {
|
||||
switch (result) {
|
||||
case hipSuccess: return "No errors";
|
||||
case hipErrorInvalidValue: return "Invalid value";
|
||||
case hipErrorOutOfMemory: return "Out of memory";
|
||||
case hipErrorNotInitialized: return "Driver not initialized";
|
||||
case hipErrorDeinitialized: return "Driver deinitialized";
|
||||
case hipErrorProfilerDisabled: return "Profiler disabled";
|
||||
case hipErrorProfilerNotInitialized: return "Profiler not initialized";
|
||||
case hipErrorProfilerAlreadyStarted: return "Profiler already started";
|
||||
case hipErrorProfilerAlreadyStopped: return "Profiler already stopped";
|
||||
case hipErrorNoDevice: return "No HIP-capable device available";
|
||||
case hipErrorInvalidDevice: return "Invalid device";
|
||||
case hipErrorInvalidImage: return "Invalid kernel image";
|
||||
case hipErrorInvalidContext: return "Invalid context";
|
||||
case hipErrorContextAlreadyCurrent: return "Context already current";
|
||||
case hipErrorMapFailed: return "Map failed";
|
||||
case hipErrorUnmapFailed: return "Unmap failed";
|
||||
case hipErrorArrayIsMapped: return "Array is mapped";
|
||||
case hipErrorAlreadyMapped: return "Already mapped";
|
||||
case hipErrorNoBinaryForGpu: return "No binary for GPU";
|
||||
case hipErrorAlreadyAcquired: return "Already acquired";
|
||||
case hipErrorNotMapped: return "Not mapped";
|
||||
case hipErrorNotMappedAsArray: return "Mapped resource not available for access as an array";
|
||||
case hipErrorNotMappedAsPointer: return "Mapped resource not available for access as a pointer";
|
||||
case hipErrorECCNotCorrectable: return "Uncorrectable ECC error detected";
|
||||
case hipErrorUnsupportedLimit: return "hipLimit_t not supported by device";
|
||||
case hipErrorContextAlreadyInUse: return "Context already in use";
|
||||
case hipErrorPeerAccessUnsupported: return "Peer access unsupported";
|
||||
case hipErrorInvalidKernelFile: return "Invalid ptx";
|
||||
case hipErrorInvalidGraphicsContext: return "Invalid graphics context";
|
||||
case hipErrorInvalidSource: return "Invalid source";
|
||||
case hipErrorFileNotFound: return "File not found";
|
||||
case hipErrorSharedObjectSymbolNotFound: return "Link to a shared object failed to resolve";
|
||||
case hipErrorSharedObjectInitFailed: return "Shared object initialization failed";
|
||||
case hipErrorOperatingSystem: return "Operating system";
|
||||
case hipErrorInvalidHandle: return "Invalid handle";
|
||||
case hipErrorNotFound: return "Not found";
|
||||
case hipErrorNotReady: return "HIP not ready";
|
||||
case hipErrorIllegalAddress: return "Illegal address";
|
||||
case hipErrorLaunchOutOfResources: return "Launch exceeded resources";
|
||||
case hipErrorLaunchTimeOut: return "Launch exceeded timeout";
|
||||
case hipErrorPeerAccessAlreadyEnabled: return "Peer access already enabled";
|
||||
case hipErrorPeerAccessNotEnabled: return "Peer access not enabled";
|
||||
case hipErrorSetOnActiveProcess: return "Primary context active";
|
||||
case hipErrorAssert: return "Assert";
|
||||
case hipErrorHostMemoryAlreadyRegistered: return "Host memory already registered";
|
||||
case hipErrorHostMemoryNotRegistered: return "Host memory not registered";
|
||||
case hipErrorLaunchFailure: return "Launch failed";
|
||||
case hipErrorCooperativeLaunchTooLarge: return "Cooperative launch too large";
|
||||
case hipErrorNotSupported: return "Not supported";
|
||||
case hipErrorUnknown: return "Unknown error";
|
||||
default: return "Unknown HIP error value";
|
||||
}
|
||||
}
|
||||
|
||||
static void path_join(const char *path1,
|
||||
const char *path2,
|
||||
int maxlen,
|
||||
char *result) {
|
||||
#if defined(WIN32) || defined(_WIN32)
|
||||
const char separator = '\\';
|
||||
#else
|
||||
const char separator = '/';
|
||||
#endif
|
||||
int n = snprintf(result, maxlen, "%s%c%s", path1, separator, path2);
|
||||
if (n != -1 && n < maxlen) {
|
||||
result[n] = '\0';
|
||||
}
|
||||
else {
|
||||
result[maxlen - 1] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
static int path_exists(const char *path) {
|
||||
struct stat st;
|
||||
if (stat(path, &st)) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char *hipewCompilerPath(void) {
|
||||
#ifdef _WIN32
|
||||
const char *hipPath = getenv("HIP_ROCCLR_HOME");
|
||||
const char *windowsCommand = "perl ";
|
||||
const char *executable = "bin/hipcc";
|
||||
|
||||
static char hipcc[65536];
|
||||
static char finalCommand[65536];
|
||||
if(hipPath) {
|
||||
path_join(hipPath, executable, sizeof(hipcc), hipcc);
|
||||
if(path_exists(hipcc)) {
|
||||
snprintf(finalCommand, sizeof(hipcc), "%s %s", windowsCommand, hipcc);
|
||||
return finalCommand;
|
||||
} else {
|
||||
printf("Could not find hipcc. Make sure HIP_ROCCLR_HOME points to the directory holding /bin/hipcc");
|
||||
}
|
||||
}
|
||||
#else
|
||||
const char *hipPath = "opt/rocm/hip/bin";
|
||||
const char *executable = "hipcc";
|
||||
|
||||
static char hipcc[65536];
|
||||
if(hipPath) {
|
||||
path_join(hipPath, executable, sizeof(hipcc), hipcc);
|
||||
if(path_exists(hipcc)){
|
||||
return hipcc;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
{
|
||||
#ifdef _WIN32
|
||||
FILE *handle = popen("where hipcc", "r");
|
||||
#else
|
||||
FILE *handle = popen("which hipcc", "r");
|
||||
#endif
|
||||
if (handle) {
|
||||
char buffer[4096] = {0};
|
||||
int len = fread(buffer, 1, sizeof(buffer) - 1, handle);
|
||||
buffer[len] = '\0';
|
||||
pclose(handle);
|
||||
if (buffer[0]) {
|
||||
return "hipcc";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Query the HIP compiler version.
 *
 * Runs `"<compiler path>" --version` and collects its output. Returns 0 when
 * the compiler cannot be found or executed.
 *
 * TODO: the version is currently hardcoded to 40 and the captured output is
 * not parsed; parse the `--version` output once the marker format is settled. */
int hipewCompilerVersion(void) {
  const char *path = hipewCompilerPath();
  FILE *pipe;
  char buf[128];
  char output[65536] = "\0";
  char command[65536] = "\0";

  if (path == NULL) {
    return 0;
  }

  /* Build `"<path>" --version` with a single bounded write. The previous
   * strncat chain passed incorrect size arguments (it ignored the quote
   * already in the buffer and subtracted strlen(path) instead of the current
   * length), which could overflow for very long paths. snprintf always
   * NUL-terminates and never writes past the buffer. */
  snprintf(command, sizeof(command), "\"%s\" --version", path);
  pipe = popen(command, "r");
  if (!pipe) {
    fprintf(stderr, "HIP: failed to run compiler to retrieve version");
    return 0;
  }

  /* Accumulate all of the compiler's stdout; strncat is bounded by the
   * space actually remaining in `output`. */
  while (!feof(pipe)) {
    if (fgets(buf, sizeof(buf), pipe) != NULL) {
      strncat(output, buf, sizeof(output) - strlen(output) - 1);
    }
  }

  pclose(pipe);
  return 40;
}
|
3
extern/tinygltf/README.blender
vendored
3
extern/tinygltf/README.blender
vendored
@@ -2,4 +2,5 @@ Project: TinyGLTF
|
||||
URL: https://github.com/syoyo/tinygltf
|
||||
License: MIT
|
||||
Upstream version: 2.5.0, 19a41d20ec0
|
||||
Local modifications: None
|
||||
Local modifications:
|
||||
* Silence "enum value not handled in switch" warnings due to JSON dependency.
|
||||
|
1
extern/tinygltf/tiny_gltf.h
vendored
1
extern/tinygltf/tiny_gltf.h
vendored
@@ -3201,6 +3201,7 @@ static bool ParseJsonAsValue(Value *ret, const json &o) {
|
||||
val = Value(o.get<double>());
|
||||
break;
|
||||
case json::value_t::null:
|
||||
case json::value_t::binary:
|
||||
case json::value_t::discarded:
|
||||
// default:
|
||||
break;
|
||||
|
@@ -297,6 +297,7 @@ endif()
|
||||
|
||||
if(WITH_CYCLES_STANDALONE)
|
||||
set(WITH_CYCLES_DEVICE_CUDA TRUE)
|
||||
set(WITH_CYCLES_DEVICE_HIP TRUE)
|
||||
endif()
|
||||
# TODO(sergey): Consider removing it, only causes confusion in interface.
|
||||
set(WITH_CYCLES_DEVICE_MULTI TRUE)
|
||||
|
@@ -53,7 +53,7 @@ struct Options {
|
||||
SessionParams session_params;
|
||||
bool quiet;
|
||||
bool show_help, interactive, pause;
|
||||
string output_path;
|
||||
string output_filepath;
|
||||
} options;
|
||||
|
||||
static void session_print(const string &str)
|
||||
@@ -160,7 +160,7 @@ static void session_init()
|
||||
/* load scene */
|
||||
scene_init();
|
||||
|
||||
options.session->reset(session_buffer_params(), options.session_params.samples);
|
||||
options.session->reset(options.session_params, session_buffer_params());
|
||||
options.session->start();
|
||||
}
|
||||
|
||||
@@ -222,9 +222,7 @@ static void display_info(Progress &progress)
|
||||
|
||||
static void display()
|
||||
{
|
||||
static DeviceDrawParams draw_params = DeviceDrawParams();
|
||||
|
||||
options.session->draw(session_buffer_params(), draw_params);
|
||||
options.session->draw();
|
||||
|
||||
display_info(options.session->progress);
|
||||
}
|
||||
@@ -254,7 +252,7 @@ static void motion(int x, int y, int button)
|
||||
options.session->scene->camera->need_flags_update = true;
|
||||
options.session->scene->camera->need_device_update = true;
|
||||
|
||||
options.session->reset(session_buffer_params(), options.session_params.samples);
|
||||
options.session->reset(options.session_params, session_buffer_params());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,7 +269,7 @@ static void resize(int width, int height)
|
||||
options.session->scene->camera->need_flags_update = true;
|
||||
options.session->scene->camera->need_device_update = true;
|
||||
|
||||
options.session->reset(session_buffer_params(), options.session_params.samples);
|
||||
options.session->reset(options.session_params, session_buffer_params());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -283,7 +281,7 @@ static void keyboard(unsigned char key)
|
||||
|
||||
/* Reset */
|
||||
else if (key == 'r')
|
||||
options.session->reset(session_buffer_params(), options.session_params.samples);
|
||||
options.session->reset(options.session_params, session_buffer_params());
|
||||
|
||||
/* Cancel */
|
||||
else if (key == 27) // escape
|
||||
@@ -320,7 +318,7 @@ static void keyboard(unsigned char key)
|
||||
options.session->scene->camera->need_flags_update = true;
|
||||
options.session->scene->camera->need_device_update = true;
|
||||
|
||||
options.session->reset(session_buffer_params(), options.session_params.samples);
|
||||
options.session->reset(options.session_params, session_buffer_params());
|
||||
}
|
||||
|
||||
/* Set Max Bounces */
|
||||
@@ -346,7 +344,7 @@ static void keyboard(unsigned char key)
|
||||
|
||||
options.session->scene->integrator->set_max_bounce(bounce);
|
||||
|
||||
options.session->reset(session_buffer_params(), options.session_params.samples);
|
||||
options.session->reset(options.session_params, session_buffer_params());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -361,11 +359,13 @@ static int files_parse(int argc, const char *argv[])
|
||||
|
||||
static void options_parse(int argc, const char **argv)
|
||||
{
|
||||
options.width = 0;
|
||||
options.height = 0;
|
||||
options.width = 1024;
|
||||
options.height = 512;
|
||||
options.filepath = "";
|
||||
options.session = NULL;
|
||||
options.quiet = false;
|
||||
options.session_params.use_auto_tile = false;
|
||||
options.session_params.tile_size = 0;
|
||||
|
||||
/* device names */
|
||||
string device_names = "";
|
||||
@@ -411,7 +411,7 @@ static void options_parse(int argc, const char **argv)
|
||||
&options.session_params.samples,
|
||||
"Number of samples to render",
|
||||
"--output %s",
|
||||
&options.output_path,
|
||||
&options.output_filepath,
|
||||
"File path to write output image",
|
||||
"--threads %d",
|
||||
&options.session_params.threads,
|
||||
@@ -422,12 +422,9 @@ static void options_parse(int argc, const char **argv)
|
||||
"--height %d",
|
||||
&options.height,
|
||||
"Window height in pixel",
|
||||
"--tile-width %d",
|
||||
&options.session_params.tile_size.x,
|
||||
"Tile width in pixels",
|
||||
"--tile-height %d",
|
||||
&options.session_params.tile_size.y,
|
||||
"Tile height in pixels",
|
||||
"--tile-size %d",
|
||||
&options.session_params.tile_size,
|
||||
"Tile size in pixels",
|
||||
"--list-devices",
|
||||
&list,
|
||||
"List information about all available devices",
|
||||
@@ -489,8 +486,9 @@ static void options_parse(int argc, const char **argv)
|
||||
options.session_params.background = true;
|
||||
#endif
|
||||
|
||||
/* Use progressive rendering */
|
||||
options.session_params.progressive = true;
|
||||
if (options.session_params.tile_size > 0) {
|
||||
options.session_params.use_auto_tile = true;
|
||||
}
|
||||
|
||||
/* find matching device */
|
||||
DeviceType device_type = Device::type_from_string(devicename.c_str());
|
||||
|
@@ -95,6 +95,9 @@ set(ADDON_FILES
|
||||
|
||||
add_definitions(${GL_DEFINITIONS})
|
||||
|
||||
if(WITH_CYCLES_DEVICE_HIP)
|
||||
add_definitions(-DWITH_HIP)
|
||||
endif()
|
||||
if(WITH_MOD_FLUID)
|
||||
add_definitions(-DWITH_FLUID)
|
||||
endif()
|
||||
|
@@ -28,7 +28,7 @@ def _configure_argument_parser():
|
||||
action='store_true')
|
||||
parser.add_argument("--cycles-device",
|
||||
help="Set the device to use for Cycles, overriding user preferences and the scene setting."
|
||||
"Valid options are 'CPU', 'CUDA' or 'OPTIX'."
|
||||
"Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'"
|
||||
"Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
|
||||
default=None)
|
||||
return parser
|
||||
|
@@ -111,6 +111,7 @@ enum_device_type = (
|
||||
('CPU', "CPU", "CPU", 0),
|
||||
('CUDA', "CUDA", "CUDA", 1),
|
||||
('OPTIX', "OptiX", "OptiX", 3),
|
||||
("HIP", "HIP", "HIP", 4)
|
||||
)
|
||||
|
||||
enum_texture_limit = (
|
||||
@@ -739,14 +740,14 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):
|
||||
|
||||
use_auto_tile: BoolProperty(
|
||||
name="Auto Tiles",
|
||||
description="Automatically split image into tiles",
|
||||
description="Automatically render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
|
||||
default=True,
|
||||
)
|
||||
tile_size: IntProperty(
|
||||
name="Tile Size",
|
||||
default=2048,
|
||||
description="",
|
||||
min=0, max=16384,
|
||||
min=8, max=16384,
|
||||
)
|
||||
|
||||
# Various fine-tuning debug flags
|
||||
@@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
||||
|
||||
def get_device_types(self, context):
|
||||
import _cycles
|
||||
has_cuda, has_optix = _cycles.get_device_types()
|
||||
has_cuda, has_optix, has_hip = _cycles.get_device_types()
|
||||
|
||||
list = [('NONE', "None", "Don't use compute device", 0)]
|
||||
if has_cuda:
|
||||
list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
|
||||
if has_optix:
|
||||
list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
|
||||
if has_hip:
|
||||
list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
|
||||
|
||||
return list
|
||||
|
||||
compute_device_type: EnumProperty(
|
||||
@@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
||||
|
||||
def update_device_entries(self, device_list):
|
||||
for device in device_list:
|
||||
if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
|
||||
if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
|
||||
continue
|
||||
# Try to find existing Device entry
|
||||
entry = self.find_existing_device_entry(device)
|
||||
@@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
||||
elif entry.type == 'CPU':
|
||||
cpu_devices.append(entry)
|
||||
# Extend all GPU devices with CPU.
|
||||
if compute_device_type != 'CPU':
|
||||
if compute_device_type != 'CPU' and compute_device_type != 'HIP':
|
||||
devices.extend(cpu_devices)
|
||||
return devices
|
||||
|
||||
@@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
|
||||
import _cycles
|
||||
# Ensure `self.devices` is not re-allocated when the second call to
|
||||
# get_devices_for_type is made, freeing items from the first list.
|
||||
for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
|
||||
for device_type in ('CUDA', 'OPTIX', 'HIP'):
|
||||
self.update_device_entries(_cycles.available_devices(device_type))
|
||||
|
||||
# Deprecated: use refresh_devices instead.
|
||||
|
@@ -99,6 +99,11 @@ def use_cuda(context):
|
||||
return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')
|
||||
|
||||
|
||||
def use_hip(context):
|
||||
cscene = context.scene.cycles
|
||||
|
||||
return (get_device_type(context) == 'HIP' and cscene.device == 'GPU')
|
||||
|
||||
def use_optix(context):
|
||||
cscene = context.scene.cycles
|
||||
|
||||
@@ -613,8 +618,8 @@ class CYCLES_RENDER_PT_performance_threads(CyclesButtonsPanel, Panel):
|
||||
sub.prop(rd, "threads")
|
||||
|
||||
|
||||
class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
|
||||
bl_label = "Tiles"
|
||||
class CYCLES_RENDER_PT_performance_memory(CyclesButtonsPanel, Panel):
|
||||
bl_label = "Memory"
|
||||
bl_parent_id = "CYCLES_RENDER_PT_performance"
|
||||
|
||||
def draw(self, context):
|
||||
@@ -2107,7 +2112,7 @@ classes = (
|
||||
CYCLES_RENDER_PT_film_transparency,
|
||||
CYCLES_RENDER_PT_performance,
|
||||
CYCLES_RENDER_PT_performance_threads,
|
||||
CYCLES_RENDER_PT_performance_tiles,
|
||||
CYCLES_RENDER_PT_performance_memory,
|
||||
CYCLES_RENDER_PT_performance_acceleration_structure,
|
||||
CYCLES_RENDER_PT_performance_final_render,
|
||||
CYCLES_RENDER_PT_performance_viewport,
|
||||
|
@@ -283,10 +283,13 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
|
||||
return;
|
||||
|
||||
Attribute *attr_intercept = NULL;
|
||||
Attribute *attr_length = NULL;
|
||||
Attribute *attr_random = NULL;
|
||||
|
||||
if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
|
||||
attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
|
||||
if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH))
|
||||
attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH);
|
||||
if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
|
||||
attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
|
||||
|
||||
@@ -336,6 +339,10 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
|
||||
num_curve_keys++;
|
||||
}
|
||||
|
||||
if (attr_length != NULL) {
|
||||
attr_length->add(CData->curve_length[curve]);
|
||||
}
|
||||
|
||||
if (attr_random != NULL) {
|
||||
attr_random->add(hash_uint2_to_float(num_curves, 0));
|
||||
}
|
||||
@@ -657,11 +664,15 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
|
||||
|
||||
/* Add requested attributes. */
|
||||
Attribute *attr_intercept = NULL;
|
||||
Attribute *attr_length = NULL;
|
||||
Attribute *attr_random = NULL;
|
||||
|
||||
if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) {
|
||||
attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
|
||||
}
|
||||
if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH)) {
|
||||
attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH);
|
||||
}
|
||||
if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) {
|
||||
attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
|
||||
}
|
||||
@@ -714,6 +725,10 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
|
||||
}
|
||||
}
|
||||
|
||||
if (attr_length) {
|
||||
attr_length->add(length);
|
||||
}
|
||||
|
||||
/* Random number per curve. */
|
||||
if (attr_random != NULL) {
|
||||
attr_random->add(hash_uint2_to_float(b_curve.index(), 0));
|
||||
|
@@ -26,6 +26,7 @@ enum ComputeDevice {
|
||||
COMPUTE_DEVICE_CPU = 0,
|
||||
COMPUTE_DEVICE_CUDA = 1,
|
||||
COMPUTE_DEVICE_OPTIX = 3,
|
||||
COMPUTE_DEVICE_HIP = 4,
|
||||
|
||||
COMPUTE_DEVICE_NUM
|
||||
};
|
||||
@@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
|
||||
else if (compute_device == COMPUTE_DEVICE_OPTIX) {
|
||||
mask |= DEVICE_MASK_OPTIX;
|
||||
}
|
||||
else if (compute_device == COMPUTE_DEVICE_HIP) {
|
||||
mask |= DEVICE_MASK_HIP;
|
||||
}
|
||||
vector<DeviceInfo> devices = Device::available_devices(mask);
|
||||
|
||||
/* Match device preferences and available devices. */
|
||||
|
@@ -80,7 +80,9 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
|
||||
{
|
||||
/* Test if we can instance or if the object is modified. */
|
||||
Geometry::Type geom_type = determine_geom_type(b_ob_info, use_particle_hair);
|
||||
BL::ID b_key_id = (BKE_object_is_modified(b_ob_info.real_object)) ? b_ob_info.real_object :
|
||||
BL::ID b_key_id = (b_ob_info.is_real_object_data() &&
|
||||
BKE_object_is_modified(b_ob_info.real_object)) ?
|
||||
b_ob_info.real_object :
|
||||
b_ob_info.object_data;
|
||||
GeometryKey key(b_key_id.ptr.data, geom_type);
|
||||
|
||||
|
@@ -485,12 +485,6 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms)
|
||||
/* See do_update_begin() for why no locking is required here. */
|
||||
const bool transparent = true; // TODO(sergey): Derive this from Film.
|
||||
|
||||
if (texture_.need_clear) {
|
||||
/* Texture is requested to be cleared and was not yet cleared.
|
||||
* Do early return which should be equivalent of drawing all-zero texture. */
|
||||
return;
|
||||
}
|
||||
|
||||
if (!gl_draw_resources_ensure()) {
|
||||
return;
|
||||
}
|
||||
@@ -499,6 +493,16 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms)
|
||||
gl_context_mutex_.lock();
|
||||
}
|
||||
|
||||
if (texture_.need_clear) {
|
||||
/* Texture is requested to be cleared and was not yet cleared.
|
||||
*
|
||||
* Do early return which should be equivalent of drawing all-zero texture.
|
||||
* Watchout for the lock though so that the clear happening during update is properly
|
||||
* synchronized here. */
|
||||
gl_context_mutex_.unlock();
|
||||
return;
|
||||
}
|
||||
|
||||
if (gl_upload_sync_) {
|
||||
glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
|
||||
}
|
||||
@@ -524,7 +528,7 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams ¶ms)
|
||||
const float zoomed_width = params.size.x * zoom_.x;
|
||||
const float zoomed_height = params.size.y * zoom_.y;
|
||||
if (texture_.width != params.size.x || texture_.height != params.size.y) {
|
||||
/* Resolution divider is different from 1, force enarest interpolation. */
|
||||
/* Resolution divider is different from 1, force nearest interpolation. */
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
|
||||
}
|
||||
else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
|
||||
|
@@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
|
||||
static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
|
||||
{
|
||||
vector<DeviceType> device_types = Device::available_types();
|
||||
bool has_cuda = false, has_optix = false;
|
||||
bool has_cuda = false, has_optix = false, has_hip = false;
|
||||
foreach (DeviceType device_type, device_types) {
|
||||
has_cuda |= (device_type == DEVICE_CUDA);
|
||||
has_optix |= (device_type == DEVICE_OPTIX);
|
||||
has_hip |= (device_type == DEVICE_HIP);
|
||||
}
|
||||
PyObject *list = PyTuple_New(2);
|
||||
PyObject *list = PyTuple_New(3);
|
||||
PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
|
||||
PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
|
||||
PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
|
||||
return list;
|
||||
}
|
||||
|
||||
@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
|
||||
else if (override == "OPTIX") {
|
||||
BlenderSession::device_override = DEVICE_MASK_OPTIX;
|
||||
}
|
||||
else if (override == "HIP") {
|
||||
BlenderSession::device_override = DEVICE_MASK_HIP;
|
||||
}
|
||||
else {
|
||||
printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
|
||||
Py_RETURN_FALSE;
|
||||
|
@@ -71,7 +71,8 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
|
||||
width(0),
|
||||
height(0),
|
||||
preview_osl(preview_osl),
|
||||
python_thread_state(NULL)
|
||||
python_thread_state(NULL),
|
||||
use_developer_ui(false)
|
||||
{
|
||||
/* offline render */
|
||||
background = true;
|
||||
@@ -311,6 +312,8 @@ void BlenderSession::read_render_tile()
|
||||
for (BL::RenderPass &b_pass : b_rlay.passes) {
|
||||
session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
|
||||
}
|
||||
|
||||
b_engine.end_result(b_rr, false, false, false);
|
||||
}
|
||||
|
||||
void BlenderSession::write_render_tile()
|
||||
@@ -557,6 +560,11 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
|
||||
/* free result without merging */
|
||||
b_engine.end_result(b_rr, true, false, false);
|
||||
|
||||
/* When tiled rendering is used there will be no "write" done for the tile. Forcefully clear
|
||||
* highlighted tiles now, so that the highlight will be removed while processing full frame from
|
||||
* file. */
|
||||
b_engine.tile_highlight_clear_all();
|
||||
|
||||
double total_time, render_time;
|
||||
session->progress.get_time(total_time, render_time);
|
||||
VLOG(1) << "Total render time: " << total_time;
|
||||
@@ -581,6 +589,12 @@ void BlenderSession::render_frame_finish()
|
||||
|
||||
for (string_view filename : full_buffer_files_) {
|
||||
session->process_full_buffer_from_disk(filename);
|
||||
if (check_and_report_session_error()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (string_view filename : full_buffer_files_) {
|
||||
path_remove(filename);
|
||||
}
|
||||
|
||||
@@ -988,8 +1002,9 @@ void BlenderSession::update_status_progress()
|
||||
get_status(status, substatus);
|
||||
get_progress(progress, total_time, render_time);
|
||||
|
||||
if (progress > 0)
|
||||
remaining_time = (1.0 - (double)progress) * (render_time / (double)progress);
|
||||
if (progress > 0) {
|
||||
remaining_time = session->get_estimated_remaining_time();
|
||||
}
|
||||
|
||||
if (background) {
|
||||
if (scene)
|
||||
@@ -1027,20 +1042,27 @@ void BlenderSession::update_status_progress()
|
||||
last_progress = progress;
|
||||
}
|
||||
|
||||
if (session->progress.get_error()) {
|
||||
string error = session->progress.get_error_message();
|
||||
check_and_report_session_error();
|
||||
}
|
||||
|
||||
bool BlenderSession::check_and_report_session_error()
|
||||
{
|
||||
if (!session->progress.get_error()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const string error = session->progress.get_error_message();
|
||||
if (error != last_error) {
|
||||
/* TODO(sergey): Currently C++ RNA API doesn't let us to
|
||||
* use mnemonic name for the variable. Would be nice to
|
||||
* have this figured out.
|
||||
/* TODO(sergey): Currently C++ RNA API doesn't let us to use mnemonic name for the variable.
|
||||
* Would be nice to have this figured out.
|
||||
*
|
||||
* For until then, 1 << 5 means RPT_ERROR.
|
||||
*/
|
||||
* For until then, 1 << 5 means RPT_ERROR. */
|
||||
b_engine.report(1 << 5, error.c_str());
|
||||
b_engine.error_set(error.c_str());
|
||||
last_error = error;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void BlenderSession::tag_update()
|
||||
|
@@ -110,8 +110,7 @@ class BlenderSession {
|
||||
BL::RenderSettings b_render;
|
||||
BL::Depsgraph b_depsgraph;
|
||||
/* NOTE: Blender's scene might become invalid after call
|
||||
* free_blender_memory_if_possible().
|
||||
*/
|
||||
* #free_blender_memory_if_possible(). */
|
||||
BL::Scene b_scene;
|
||||
BL::SpaceView3D b_v3d;
|
||||
BL::RegionView3D b_rv3d;
|
||||
@@ -147,6 +146,11 @@ class BlenderSession {
|
||||
protected:
|
||||
void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);
|
||||
|
||||
/* Check whether session error happened.
|
||||
* If so, it is reported to the render engine and true is returned.
|
||||
* Otherwise false is returned. */
|
||||
bool check_and_report_session_error();
|
||||
|
||||
void builtin_images_load();
|
||||
|
||||
/* Is used after each render layer synchronization is done with the goal
|
||||
|
@@ -855,7 +855,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,
|
||||
|
||||
if (background) {
|
||||
params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile");
|
||||
params.tile_size = get_int(cscene, "tile_size");
|
||||
params.tile_size = max(get_int(cscene, "tile_size"), 8);
|
||||
}
|
||||
else {
|
||||
params.use_auto_tile = false;
|
||||
|
@@ -90,27 +90,28 @@ static inline BL::Mesh object_to_mesh(BL::BlendData & /*data*/,
|
||||
}
|
||||
#endif
|
||||
|
||||
BL::Mesh mesh(PointerRNA_NULL);
|
||||
if (b_ob_info.object_data.is_a(&RNA_Mesh)) {
|
||||
/* TODO: calc_undeformed is not used. */
|
||||
mesh = BL::Mesh(b_ob_info.object_data);
|
||||
BL::Mesh mesh = (b_ob_info.object_data.is_a(&RNA_Mesh)) ? BL::Mesh(b_ob_info.object_data) :
|
||||
BL::Mesh(PointerRNA_NULL);
|
||||
|
||||
if (b_ob_info.is_real_object_data()) {
|
||||
if (mesh) {
|
||||
/* Make a copy to split faces if we use autosmooth, otherwise not needed.
|
||||
* Also in edit mode do we need to make a copy, to ensure data layers like
|
||||
* UV are not empty. */
|
||||
if (mesh.is_editmode() ||
|
||||
(mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) {
|
||||
BL::Depsgraph depsgraph(PointerRNA_NULL);
|
||||
assert(b_ob_info.is_real_object_data());
|
||||
mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
|
||||
}
|
||||
}
|
||||
else {
|
||||
BL::Depsgraph depsgraph(PointerRNA_NULL);
|
||||
if (b_ob_info.is_real_object_data()) {
|
||||
mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* TODO: what to do about non-mesh geometry instances? */
|
||||
}
|
||||
|
||||
#if 0
|
||||
if (subdivision_type != Mesh::SUBDIVISION_NONE) {
|
||||
|
@@ -213,7 +213,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
|
||||
if (ctx->num_hits < ctx->max_hits) {
|
||||
Intersection current_isect;
|
||||
kernel_embree_convert_hit(kg, ray, hit, ¤t_isect);
|
||||
for (size_t i = 0; i < ctx->max_hits; ++i) {
|
||||
for (size_t i = 0; i < ctx->num_hits; ++i) {
|
||||
if (current_isect.object == ctx->isect_s[i].object &&
|
||||
current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) {
|
||||
/* This intersection was already recorded, skip it. */
|
||||
|
@@ -532,4 +532,13 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
###########################################################################
|
||||
# HIP
|
||||
###########################################################################
|
||||
|
||||
if(NOT WITH_HIP_DYNLOAD)
|
||||
set(WITH_HIP_DYNLOAD ON)
|
||||
endif()
|
||||
|
||||
unset(_cycles_lib_dir)
|
||||
|
@@ -156,11 +156,17 @@ macro(cycles_target_link_libraries target)
|
||||
${PLATFORM_LINKLIBS}
|
||||
)
|
||||
|
||||
if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
|
||||
if(WITH_CUDA_DYNLOAD)
|
||||
target_link_libraries(${target} extern_cuew)
|
||||
else()
|
||||
target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
|
||||
target_link_libraries(${target} extern_hipew)
|
||||
endif()
|
||||
|
||||
if(CYCLES_STANDALONE_REPOSITORY)
|
||||
target_link_libraries(${target} extern_numaapi)
|
||||
|
@@ -22,6 +22,7 @@ set(INC_SYS
|
||||
../../../extern/clew/include
|
||||
)
|
||||
|
||||
if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
|
||||
if(WITH_CUDA_DYNLOAD)
|
||||
list(APPEND INC
|
||||
../../../extern/cuew/include
|
||||
@@ -33,6 +34,14 @@ else()
|
||||
)
|
||||
add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
|
||||
list(APPEND INC
|
||||
../../../extern/hipew/include
|
||||
)
|
||||
add_definitions(-DWITH_HIP_DYNLOAD)
|
||||
endif()
|
||||
|
||||
set(SRC
|
||||
device.cpp
|
||||
@@ -70,6 +79,21 @@ set(SRC_CUDA
|
||||
cuda/util.h
|
||||
)
|
||||
|
||||
set(SRC_HIP
|
||||
hip/device.cpp
|
||||
hip/device.h
|
||||
hip/device_impl.cpp
|
||||
hip/device_impl.h
|
||||
hip/graphics_interop.cpp
|
||||
hip/graphics_interop.h
|
||||
hip/kernel.cpp
|
||||
hip/kernel.h
|
||||
hip/queue.cpp
|
||||
hip/queue.h
|
||||
hip/util.cpp
|
||||
hip/util.h
|
||||
)
|
||||
|
||||
set(SRC_DUMMY
|
||||
dummy/device.cpp
|
||||
dummy/device.h
|
||||
@@ -105,6 +129,7 @@ set(LIB
|
||||
${CYCLES_GL_LIBRARIES}
|
||||
)
|
||||
|
||||
if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
|
||||
if(WITH_CUDA_DYNLOAD)
|
||||
list(APPEND LIB
|
||||
extern_cuew
|
||||
@@ -114,12 +139,22 @@ else()
|
||||
${CUDA_CUDA_LIBRARY}
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
|
||||
list(APPEND LIB
|
||||
extern_hipew
|
||||
)
|
||||
endif()
|
||||
|
||||
add_definitions(${GL_DEFINITIONS})
|
||||
|
||||
if(WITH_CYCLES_DEVICE_CUDA)
|
||||
add_definitions(-DWITH_CUDA)
|
||||
endif()
|
||||
if(WITH_CYCLES_DEVICE_HIP)
|
||||
add_definitions(-DWITH_HIP)
|
||||
endif()
|
||||
if(WITH_CYCLES_DEVICE_OPTIX)
|
||||
add_definitions(-DWITH_OPTIX)
|
||||
endif()
|
||||
@@ -140,6 +175,7 @@ cycles_add_library(cycles_device "${LIB}"
|
||||
${SRC}
|
||||
${SRC_CPU}
|
||||
${SRC_CUDA}
|
||||
${SRC_HIP}
|
||||
${SRC_DUMMY}
|
||||
${SRC_MULTI}
|
||||
${SRC_OPTIX}
|
||||
|
@@ -54,7 +54,6 @@
|
||||
#include "util/util_function.h"
|
||||
#include "util/util_logging.h"
|
||||
#include "util/util_map.h"
|
||||
#include "util/util_opengl.h"
|
||||
#include "util/util_openimagedenoise.h"
|
||||
#include "util/util_optimization.h"
|
||||
#include "util/util_progress.h"
|
||||
@@ -170,7 +169,7 @@ void CPUDevice::mem_copy_to(device_memory &mem)
|
||||
}
|
||||
|
||||
void CPUDevice::mem_copy_from(
|
||||
device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
|
||||
device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/)
|
||||
{
|
||||
/* no-op */
|
||||
}
|
||||
@@ -204,7 +203,7 @@ void CPUDevice::mem_free(device_memory &mem)
|
||||
}
|
||||
}
|
||||
|
||||
device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
|
||||
device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
|
||||
{
|
||||
return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
|
||||
}
|
||||
@@ -298,154 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
||||
Device::build_bvh(bvh, progress, refit);
|
||||
}
|
||||
|
||||
#if 0
|
||||
void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
|
||||
{
|
||||
const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
|
||||
|
||||
scoped_timer timer(&tile.buffers->render_time);
|
||||
|
||||
Coverage coverage(kg, tile);
|
||||
if (use_coverage) {
|
||||
coverage.init_path_trace();
|
||||
}
|
||||
|
||||
float *render_buffer = (float *)tile.buffer;
|
||||
int start_sample = tile.start_sample;
|
||||
int end_sample = tile.start_sample + tile.num_samples;
|
||||
|
||||
/* Needed for Embree. */
|
||||
SIMD_SET_FLUSH_TO_ZERO;
|
||||
|
||||
for (int sample = start_sample; sample < end_sample; sample++) {
|
||||
if (task.get_cancel() || TaskPool::canceled()) {
|
||||
if (task.need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
|
||||
if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
|
||||
tile.stealing_state = RenderTile::WAS_STOLEN;
|
||||
break;
|
||||
}
|
||||
|
||||
if (tile.task == RenderTile::PATH_TRACE) {
|
||||
for (int y = tile.y; y < tile.y + tile.h; y++) {
|
||||
for (int x = tile.x; x < tile.x + tile.w; x++) {
|
||||
if (use_coverage) {
|
||||
coverage.init_pixel(x, y);
|
||||
}
|
||||
kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int y = tile.y; y < tile.y + tile.h; y++) {
|
||||
for (int x = tile.x; x < tile.x + tile.w; x++) {
|
||||
kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
tile.sample = sample + 1;
|
||||
|
||||
if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
|
||||
const bool stop = adaptive_sampling_filter(kg, tile, sample);
|
||||
if (stop) {
|
||||
const int num_progress_samples = end_sample - sample;
|
||||
tile.sample = end_sample;
|
||||
task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
task.update_progress(&tile, tile.w * tile.h);
|
||||
}
|
||||
if (use_coverage) {
|
||||
coverage.finalize();
|
||||
}
|
||||
|
||||
if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
|
||||
adaptive_sampling_post(tile, kg);
|
||||
}
|
||||
}
|
||||
|
||||
void CPUDevice::thread_render(DeviceTask &task)
|
||||
{
|
||||
if (TaskPool::canceled()) {
|
||||
if (task.need_finish_queue == false)
|
||||
return;
|
||||
}
|
||||
|
||||
/* allocate buffer for kernel globals */
|
||||
CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
|
||||
|
||||
profiler.add_state(&kg.profiler);
|
||||
|
||||
/* NLM denoiser. */
|
||||
DenoisingTask *denoising = NULL;
|
||||
|
||||
/* OpenImageDenoise: we can only denoise with one thread at a time, so to
|
||||
* avoid waiting with mutex locks in the denoiser, we let only a single
|
||||
* thread acquire denoising tiles. */
|
||||
uint tile_types = task.tile_types;
|
||||
bool hold_denoise_lock = false;
|
||||
if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
|
||||
if (!oidn_task_lock.try_lock()) {
|
||||
tile_types &= ~RenderTile::DENOISE;
|
||||
hold_denoise_lock = true;
|
||||
}
|
||||
}
|
||||
|
||||
RenderTile tile;
|
||||
while (task.acquire_tile(this, tile, tile_types)) {
|
||||
if (tile.task == RenderTile::PATH_TRACE) {
|
||||
render(task, tile, &kg);
|
||||
}
|
||||
else if (tile.task == RenderTile::BAKE) {
|
||||
render(task, tile, &kg);
|
||||
}
|
||||
else if (tile.task == RenderTile::DENOISE) {
|
||||
denoise_openimagedenoise(task, tile);
|
||||
task.update_progress(&tile, tile.w * tile.h);
|
||||
}
|
||||
|
||||
task.release_tile(tile);
|
||||
|
||||
if (TaskPool::canceled()) {
|
||||
if (task.need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (hold_denoise_lock) {
|
||||
oidn_task_lock.unlock();
|
||||
}
|
||||
|
||||
profiler.remove_state(&kg.profiler);
|
||||
|
||||
delete denoising;
|
||||
}
|
||||
|
||||
void CPUDevice::thread_denoise(DeviceTask &task)
|
||||
{
|
||||
RenderTile tile;
|
||||
tile.x = task.x;
|
||||
tile.y = task.y;
|
||||
tile.w = task.w;
|
||||
tile.h = task.h;
|
||||
tile.buffer = task.buffer;
|
||||
tile.sample = task.sample + task.num_samples;
|
||||
tile.num_samples = task.num_samples;
|
||||
tile.start_sample = task.sample;
|
||||
tile.offset = task.offset;
|
||||
tile.stride = task.stride;
|
||||
tile.buffers = task.buffers;
|
||||
|
||||
denoise_openimagedenoise(task, tile);
|
||||
|
||||
task.update_progress(&tile, tile.w * tile.h);
|
||||
}
|
||||
#endif
|
||||
|
||||
const CPUKernels *CPUDevice::get_cpu_kernels() const
|
||||
{
|
||||
return &kernels;
|
||||
|
@@ -72,10 +72,13 @@ class CPUDevice : public Device {
|
||||
|
||||
virtual void mem_alloc(device_memory &mem) override;
|
||||
virtual void mem_copy_to(device_memory &mem) override;
|
||||
virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
|
||||
virtual void mem_copy_from(
|
||||
device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
|
||||
virtual void mem_zero(device_memory &mem) override;
|
||||
virtual void mem_free(device_memory &mem) override;
|
||||
virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
|
||||
virtual device_ptr mem_alloc_sub_ptr(device_memory &mem,
|
||||
size_t offset,
|
||||
size_t /*size*/) override;
|
||||
|
||||
virtual void const_copy_to(const char *name, void *host, size_t size) override;
|
||||
|
||||
|
@@ -31,7 +31,6 @@
|
||||
# include "util/util_logging.h"
|
||||
# include "util/util_map.h"
|
||||
# include "util/util_md5.h"
|
||||
# include "util/util_opengl.h"
|
||||
# include "util/util_path.h"
|
||||
# include "util/util_string.h"
|
||||
# include "util/util_system.h"
|
||||
@@ -837,7 +836,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
|
||||
}
|
||||
}
|
||||
|
||||
void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
|
||||
void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
|
||||
{
|
||||
if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
|
||||
assert(!"mem_copy_from not supported for textures.");
|
||||
@@ -891,7 +890,7 @@ void CUDADevice::mem_free(device_memory &mem)
|
||||
}
|
||||
}
|
||||
|
||||
device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
|
||||
device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
|
||||
{
|
||||
return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
|
||||
}
|
||||
@@ -1169,141 +1168,6 @@ void CUDADevice::tex_free(device_texture &mem)
|
||||
}
|
||||
}
|
||||
|
||||
# if 0
|
||||
void CUDADevice::render(DeviceTask &task,
|
||||
RenderTile &rtile,
|
||||
device_vector<KernelWorkTile> &work_tiles)
|
||||
{
|
||||
scoped_timer timer(&rtile.buffers->render_time);
|
||||
|
||||
if (have_error())
|
||||
return;
|
||||
|
||||
CUDAContextScope scope(this);
|
||||
CUfunction cuRender;
|
||||
|
||||
/* Get kernel function. */
|
||||
if (rtile.task == RenderTile::BAKE) {
|
||||
cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
|
||||
}
|
||||
else {
|
||||
cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
|
||||
}
|
||||
|
||||
if (have_error()) {
|
||||
return;
|
||||
}
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
/* Allocate work tile. */
|
||||
work_tiles.alloc(1);
|
||||
|
||||
KernelWorkTile *wtile = work_tiles.data();
|
||||
wtile->x = rtile.x;
|
||||
wtile->y = rtile.y;
|
||||
wtile->w = rtile.w;
|
||||
wtile->h = rtile.h;
|
||||
wtile->offset = rtile.offset;
|
||||
wtile->stride = rtile.stride;
|
||||
wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
|
||||
|
||||
/* Prepare work size. More step samples render faster, but for now we
|
||||
* remain conservative for GPUs connected to a display to avoid driver
|
||||
* timeouts and display freezing. */
|
||||
int min_blocks, num_threads_per_block;
|
||||
cuda_assert(
|
||||
cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
|
||||
if (!info.display_device) {
|
||||
min_blocks *= 8;
|
||||
}
|
||||
|
||||
uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
|
||||
|
||||
/* Render all samples. */
|
||||
uint start_sample = rtile.start_sample;
|
||||
uint end_sample = rtile.start_sample + rtile.num_samples;
|
||||
|
||||
for (int sample = start_sample; sample < end_sample;) {
|
||||
/* Setup and copy work tile to device. */
|
||||
wtile->start_sample = sample;
|
||||
wtile->num_samples = step_samples;
|
||||
if (task.adaptive_sampling.use) {
|
||||
wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
|
||||
}
|
||||
wtile->num_samples = min(wtile->num_samples, end_sample - sample);
|
||||
work_tiles.copy_to_device();
|
||||
|
||||
CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
|
||||
uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
|
||||
uint num_blocks = divide_up(total_work_size, num_threads_per_block);
|
||||
|
||||
/* Launch kernel. */
|
||||
void *args[] = {&d_work_tiles, &total_work_size};
|
||||
|
||||
cuda_assert(
|
||||
cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
|
||||
|
||||
/* Run the adaptive sampling kernels at selected samples aligned to step samples. */
|
||||
uint filter_sample = sample + wtile->num_samples - 1;
|
||||
if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
|
||||
adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
|
||||
}
|
||||
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
/* Update progress. */
|
||||
sample += wtile->num_samples;
|
||||
rtile.sample = sample;
|
||||
task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
|
||||
|
||||
if (task.get_cancel()) {
|
||||
if (task.need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Finalize adaptive sampling. */
|
||||
if (task.adaptive_sampling.use) {
|
||||
CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
|
||||
adaptive_sampling_post(rtile, wtile, d_work_tiles);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
|
||||
}
|
||||
}
|
||||
|
||||
void CUDADevice::thread_run(DeviceTask &task)
|
||||
{
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
if (task.type == DeviceTask::RENDER) {
|
||||
device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
|
||||
|
||||
/* keep rendering tiles until done */
|
||||
RenderTile tile;
|
||||
DenoisingTask denoising(this, task);
|
||||
|
||||
while (task.acquire_tile(this, tile, task.tile_types)) {
|
||||
if (tile.task == RenderTile::PATH_TRACE) {
|
||||
render(task, tile, work_tiles);
|
||||
}
|
||||
else if (tile.task == RenderTile::BAKE) {
|
||||
render(task, tile, work_tiles);
|
||||
}
|
||||
|
||||
task.release_tile(tile);
|
||||
|
||||
if (task.get_cancel()) {
|
||||
if (task.need_finish_queue == false)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
work_tiles.free();
|
||||
}
|
||||
}
|
||||
# endif
|
||||
|
||||
unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
|
||||
{
|
||||
return make_unique<CUDADeviceQueue>(this);
|
||||
|
@@ -26,7 +26,6 @@
|
||||
# ifdef WITH_CUDA_DYNLOAD
|
||||
# include "cuew.h"
|
||||
# else
|
||||
# include "util/util_opengl.h"
|
||||
# include <cuda.h>
|
||||
# include <cudaGL.h>
|
||||
# endif
|
||||
@@ -120,13 +119,13 @@ class CUDADevice : public Device {
|
||||
|
||||
void mem_copy_to(device_memory &mem) override;
|
||||
|
||||
void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
|
||||
void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
|
||||
|
||||
void mem_zero(device_memory &mem) override;
|
||||
|
||||
void mem_free(device_memory &mem) override;
|
||||
|
||||
device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
|
||||
device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
|
||||
|
||||
virtual void const_copy_to(const char *name, void *host, size_t size) override;
|
||||
|
||||
|
@@ -116,8 +116,7 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
|
||||
}
|
||||
|
||||
/* Launch kernel. */
|
||||
cuda_device_assert(cuda_device_,
|
||||
cuLaunchKernel(cuda_kernel.function,
|
||||
assert_success(cuLaunchKernel(cuda_kernel.function,
|
||||
num_blocks,
|
||||
1,
|
||||
1,
|
||||
@@ -127,7 +126,8 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
|
||||
shared_mem_bytes,
|
||||
cuda_stream_,
|
||||
args,
|
||||
0));
|
||||
0),
|
||||
"enqueue");
|
||||
|
||||
return !(cuda_device_->have_error());
|
||||
}
|
||||
@@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
|
||||
}
|
||||
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
|
||||
assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
|
||||
|
||||
debug_synchronize();
|
||||
|
||||
return !(cuda_device_->have_error());
|
||||
@@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
|
||||
assert(mem.device_pointer != 0);
|
||||
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
|
||||
assert_success(
|
||||
cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
|
||||
"zero_to_device");
|
||||
}
|
||||
|
||||
void CUDADeviceQueue::copy_to_device(device_memory &mem)
|
||||
@@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem)
|
||||
|
||||
/* Copy memory to device. */
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
assert_success(
|
||||
cuMemcpyHtoDAsync(
|
||||
(CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
|
||||
(CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
|
||||
"copy_to_device");
|
||||
}
|
||||
|
||||
void CUDADeviceQueue::copy_from_device(device_memory &mem)
|
||||
@@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem)
|
||||
|
||||
/* Copy memory from device. */
|
||||
const CUDAContextScope scope(cuda_device_);
|
||||
cuda_device_assert(
|
||||
cuda_device_,
|
||||
assert_success(
|
||||
cuMemcpyDtoHAsync(
|
||||
mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
|
||||
mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
|
||||
"copy_from_device");
|
||||
}
|
||||
|
||||
void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
|
||||
{
|
||||
if (result != CUDA_SUCCESS) {
|
||||
const char *name = cuewErrorString(result);
|
||||
cuda_device_->set_error(string_printf(
|
||||
"%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
|
||||
|
@@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
|
||||
protected:
|
||||
CUDADevice *cuda_device_;
|
||||
CUstream cuda_stream_;
|
||||
|
||||
void assert_success(CUresult result, const char *operation);
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@@ -25,6 +25,7 @@
|
||||
#include "device/cpu/device.h"
|
||||
#include "device/cuda/device.h"
|
||||
#include "device/dummy/device.h"
|
||||
#include "device/hip/device.h"
|
||||
#include "device/multi/device.h"
|
||||
#include "device/optix/device.h"
|
||||
|
||||
@@ -32,7 +33,6 @@
|
||||
#include "util/util_half.h"
|
||||
#include "util/util_logging.h"
|
||||
#include "util/util_math.h"
|
||||
#include "util/util_opengl.h"
|
||||
#include "util/util_string.h"
|
||||
#include "util/util_system.h"
|
||||
#include "util/util_time.h"
|
||||
@@ -47,6 +47,7 @@ thread_mutex Device::device_mutex;
|
||||
vector<DeviceInfo> Device::cuda_devices;
|
||||
vector<DeviceInfo> Device::optix_devices;
|
||||
vector<DeviceInfo> Device::cpu_devices;
|
||||
vector<DeviceInfo> Device::hip_devices;
|
||||
uint Device::devices_initialized_mask = 0;
|
||||
|
||||
/* Device */
|
||||
@@ -97,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
|
||||
device = device_optix_create(info, stats, profiler);
|
||||
break;
|
||||
#endif
|
||||
|
||||
#ifdef WITH_HIP
|
||||
case DEVICE_HIP:
|
||||
if (device_hip_init())
|
||||
device = device_hip_create(info, stats, profiler);
|
||||
break;
|
||||
#endif
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -118,6 +127,8 @@ DeviceType Device::type_from_string(const char *name)
|
||||
return DEVICE_OPTIX;
|
||||
else if (strcmp(name, "MULTI") == 0)
|
||||
return DEVICE_MULTI;
|
||||
else if (strcmp(name, "HIP") == 0)
|
||||
return DEVICE_HIP;
|
||||
|
||||
return DEVICE_NONE;
|
||||
}
|
||||
@@ -132,6 +143,8 @@ string Device::string_from_type(DeviceType type)
|
||||
return "OPTIX";
|
||||
else if (type == DEVICE_MULTI)
|
||||
return "MULTI";
|
||||
else if (type == DEVICE_HIP)
|
||||
return "HIP";
|
||||
|
||||
return "";
|
||||
}
|
||||
@@ -146,6 +159,10 @@ vector<DeviceType> Device::available_types()
|
||||
#ifdef WITH_OPTIX
|
||||
types.push_back(DEVICE_OPTIX);
|
||||
#endif
|
||||
#ifdef WITH_HIP
|
||||
types.push_back(DEVICE_HIP);
|
||||
#endif
|
||||
|
||||
return types;
|
||||
}
|
||||
|
||||
@@ -187,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_HIP
|
||||
if (mask & DEVICE_MASK_HIP) {
|
||||
if (!(devices_initialized_mask & DEVICE_MASK_HIP)) {
|
||||
if (device_hip_init()) {
|
||||
device_hip_info(hip_devices);
|
||||
}
|
||||
devices_initialized_mask |= DEVICE_MASK_HIP;
|
||||
}
|
||||
foreach (DeviceInfo &info, hip_devices) {
|
||||
devices.push_back(info);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (mask & DEVICE_MASK_CPU) {
|
||||
if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
|
||||
device_cpu_info(cpu_devices);
|
||||
@@ -227,6 +258,15 @@ string Device::device_capabilities(uint mask)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef WITH_HIP
|
||||
if (mask & DEVICE_MASK_HIP) {
|
||||
if (device_hip_init()) {
|
||||
capabilities += "\nHIP device capabilities:\n";
|
||||
capabilities += device_hip_capabilities();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return capabilities;
|
||||
}
|
||||
|
||||
@@ -315,6 +355,7 @@ void Device::free_memory()
|
||||
devices_initialized_mask = 0;
|
||||
cuda_devices.free_memory();
|
||||
optix_devices.free_memory();
|
||||
hip_devices.free_memory();
|
||||
cpu_devices.free_memory();
|
||||
}
|
||||
|
||||
|
@@ -51,6 +51,7 @@ enum DeviceType {
|
||||
DEVICE_CUDA,
|
||||
DEVICE_MULTI,
|
||||
DEVICE_OPTIX,
|
||||
DEVICE_HIP,
|
||||
DEVICE_DUMMY,
|
||||
};
|
||||
|
||||
@@ -58,6 +59,7 @@ enum DeviceTypeMask {
|
||||
DEVICE_MASK_CPU = (1 << DEVICE_CPU),
|
||||
DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
|
||||
DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
|
||||
DEVICE_MASK_HIP = (1 << DEVICE_HIP),
|
||||
DEVICE_MASK_ALL = ~0
|
||||
};
|
||||
|
||||
@@ -119,7 +121,7 @@ class Device {
|
||||
|
||||
string error_msg;
|
||||
|
||||
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
|
||||
virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, size_t /*offset*/, size_t /*size*/)
|
||||
{
|
||||
/* Only required for devices that implement denoising. */
|
||||
assert(false);
|
||||
@@ -273,7 +275,7 @@ class Device {
|
||||
|
||||
virtual void mem_alloc(device_memory &mem) = 0;
|
||||
virtual void mem_copy_to(device_memory &mem) = 0;
|
||||
virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0;
|
||||
virtual void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) = 0;
|
||||
virtual void mem_zero(device_memory &mem) = 0;
|
||||
virtual void mem_free(device_memory &mem) = 0;
|
||||
|
||||
@@ -284,6 +286,7 @@ class Device {
|
||||
static vector<DeviceInfo> cuda_devices;
|
||||
static vector<DeviceInfo> optix_devices;
|
||||
static vector<DeviceInfo> cpu_devices;
|
||||
static vector<DeviceInfo> hip_devices;
|
||||
static uint devices_initialized_mask;
|
||||
};
|
||||
|
||||
|
@@ -136,7 +136,7 @@ void device_memory::device_copy_to()
|
||||
}
|
||||
}
|
||||
|
||||
void device_memory::device_copy_from(int y, int w, int h, int elem)
|
||||
void device_memory::device_copy_from(size_t y, size_t w, size_t h, size_t elem)
|
||||
{
|
||||
assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
|
||||
device->mem_copy_from(*this, y, w, h, elem);
|
||||
@@ -181,7 +181,7 @@ bool device_memory::is_resident(Device *sub_device) const
|
||||
|
||||
/* Device Sub Ptr */
|
||||
|
||||
device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
|
||||
device_sub_ptr::device_sub_ptr(device_memory &mem, size_t offset, size_t size) : device(mem.device)
|
||||
{
|
||||
ptr = device->mem_alloc_sub_ptr(mem, offset, size);
|
||||
}
|
||||
|
@@ -81,154 +81,154 @@ static constexpr size_t datatype_size(DataType datatype)
|
||||
|
||||
template<typename T> struct device_type_traits {
|
||||
static const DataType data_type = TYPE_UNKNOWN;
|
||||
static const int num_elements_cpu = sizeof(T);
|
||||
static const int num_elements_gpu = sizeof(T);
|
||||
static const size_t num_elements_cpu = sizeof(T);
|
||||
static const size_t num_elements_gpu = sizeof(T);
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uchar> {
|
||||
static const DataType data_type = TYPE_UCHAR;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uchar2> {
|
||||
static const DataType data_type = TYPE_UCHAR;
|
||||
static const int num_elements_cpu = 2;
|
||||
static const int num_elements_gpu = 2;
|
||||
static const size_t num_elements_cpu = 2;
|
||||
static const size_t num_elements_gpu = 2;
|
||||
static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uchar3> {
|
||||
static const DataType data_type = TYPE_UCHAR;
|
||||
static const int num_elements_cpu = 3;
|
||||
static const int num_elements_gpu = 3;
|
||||
static const size_t num_elements_cpu = 3;
|
||||
static const size_t num_elements_gpu = 3;
|
||||
static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uchar4> {
|
||||
static const DataType data_type = TYPE_UCHAR;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 4;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 4;
|
||||
static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uint> {
|
||||
static const DataType data_type = TYPE_UINT;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uint2> {
|
||||
static const DataType data_type = TYPE_UINT;
|
||||
static const int num_elements_cpu = 2;
|
||||
static const int num_elements_gpu = 2;
|
||||
static const size_t num_elements_cpu = 2;
|
||||
static const size_t num_elements_gpu = 2;
|
||||
static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uint3> {
|
||||
static const DataType data_type = TYPE_UINT;
|
||||
static const int num_elements_cpu = 3;
|
||||
static const int num_elements_gpu = 3;
|
||||
static const size_t num_elements_cpu = 3;
|
||||
static const size_t num_elements_gpu = 3;
|
||||
static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uint4> {
|
||||
static const DataType data_type = TYPE_UINT;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 4;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 4;
|
||||
static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<int> {
|
||||
static const DataType data_type = TYPE_INT;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<int2> {
|
||||
static const DataType data_type = TYPE_INT;
|
||||
static const int num_elements_cpu = 2;
|
||||
static const int num_elements_gpu = 2;
|
||||
static const size_t num_elements_cpu = 2;
|
||||
static const size_t num_elements_gpu = 2;
|
||||
static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<int3> {
|
||||
static const DataType data_type = TYPE_INT;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 3;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 3;
|
||||
static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<int4> {
|
||||
static const DataType data_type = TYPE_INT;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 4;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 4;
|
||||
static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<float> {
|
||||
static const DataType data_type = TYPE_FLOAT;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<float2> {
|
||||
static const DataType data_type = TYPE_FLOAT;
|
||||
static const int num_elements_cpu = 2;
|
||||
static const int num_elements_gpu = 2;
|
||||
static const size_t num_elements_cpu = 2;
|
||||
static const size_t num_elements_gpu = 2;
|
||||
static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<float3> {
|
||||
static const DataType data_type = TYPE_FLOAT;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 3;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 3;
|
||||
static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<float4> {
|
||||
static const DataType data_type = TYPE_FLOAT;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 4;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 4;
|
||||
static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<half> {
|
||||
static const DataType data_type = TYPE_HALF;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<ushort4> {
|
||||
static const DataType data_type = TYPE_UINT16;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 4;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 4;
|
||||
static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uint16_t> {
|
||||
static const DataType data_type = TYPE_UINT16;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<half4> {
|
||||
static const DataType data_type = TYPE_HALF;
|
||||
static const int num_elements_cpu = 4;
|
||||
static const int num_elements_gpu = 4;
|
||||
static const size_t num_elements_cpu = 4;
|
||||
static const size_t num_elements_gpu = 4;
|
||||
static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
template<> struct device_type_traits<uint64_t> {
|
||||
static const DataType data_type = TYPE_UINT64;
|
||||
static const int num_elements_cpu = 1;
|
||||
static const int num_elements_gpu = 1;
|
||||
static const size_t num_elements_cpu = 1;
|
||||
static const size_t num_elements_gpu = 1;
|
||||
static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
|
||||
};
|
||||
|
||||
@@ -277,6 +277,7 @@ class device_memory {
|
||||
protected:
|
||||
friend class CUDADevice;
|
||||
friend class OptiXDevice;
|
||||
friend class HIPDevice;
|
||||
|
||||
/* Only create through subclasses. */
|
||||
device_memory(Device *device, const char *name, MemoryType type);
|
||||
@@ -296,7 +297,7 @@ class device_memory {
|
||||
void device_alloc();
|
||||
void device_free();
|
||||
void device_copy_to();
|
||||
void device_copy_from(int y, int w, int h, int elem);
|
||||
void device_copy_from(size_t y, size_t w, size_t h, size_t elem);
|
||||
void device_zero();
|
||||
|
||||
bool device_is_cpu();
|
||||
@@ -565,7 +566,7 @@ template<typename T> class device_vector : public device_memory {
|
||||
device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
|
||||
}
|
||||
|
||||
void copy_from_device(int y, int w, int h)
|
||||
void copy_from_device(size_t y, size_t w, size_t h)
|
||||
{
|
||||
device_copy_from(y, w, h, sizeof(T));
|
||||
}
|
||||
@@ -601,7 +602,7 @@ template<typename T> class device_vector : public device_memory {
|
||||
|
||||
class device_sub_ptr {
|
||||
public:
|
||||
device_sub_ptr(device_memory &mem, int offset, int size);
|
||||
device_sub_ptr(device_memory &mem, size_t offset, size_t size);
|
||||
~device_sub_ptr();
|
||||
|
||||
device_ptr operator*() const
|
||||
|
@@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution()
|
||||
{
|
||||
if (VLOG_IS_ON(3)) {
|
||||
last_sync_time_ = time_dt();
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
|
||||
@@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
|
||||
if (VLOG_IS_ON(3)) {
|
||||
VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
|
||||
<< work_size;
|
||||
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
|
||||
}
|
||||
|
||||
last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
|
||||
}
|
||||
|
||||
void DeviceQueue::debug_synchronize()
|
||||
@@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize()
|
||||
stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;
|
||||
|
||||
last_sync_time_ = new_time;
|
||||
}
|
||||
|
||||
last_kernels_enqueued_ = 0;
|
||||
}
|
||||
|
||||
string DeviceQueue::debug_active_kernels()
|
||||
{
|
||||
return device_kernel_mask_as_string(last_kernels_enqueued_);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@@ -21,6 +21,7 @@
|
||||
#include "device/device_graphics_interop.h"
|
||||
#include "util/util_logging.h"
|
||||
#include "util/util_map.h"
|
||||
#include "util/util_string.h"
|
||||
#include "util/util_unique_ptr.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
@@ -101,6 +102,7 @@ class DeviceQueue {
|
||||
void debug_init_execution();
|
||||
void debug_enqueue(DeviceKernel kernel, const int work_size);
|
||||
void debug_synchronize();
|
||||
string debug_active_kernels();
|
||||
|
||||
/* Combination of kernels enqueued together sync last synchronize. */
|
||||
DeviceKernelMask last_kernels_enqueued_;
|
||||
|
@@ -48,7 +48,7 @@ class DummyDevice : public Device {
|
||||
{
|
||||
}
|
||||
|
||||
virtual void mem_copy_from(device_memory &, int, int, int, int) override
|
||||
virtual void mem_copy_from(device_memory &, size_t, size_t, size_t, size_t) override
|
||||
{
|
||||
}
|
||||
|
||||
|
276
intern/cycles/device/hip/device.cpp
Normal file
276
intern/cycles/device/hip/device.cpp
Normal file
@@ -0,0 +1,276 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "device/hip/device.h"
|
||||
|
||||
#include "util/util_logging.h"
|
||||
|
||||
#ifdef WITH_HIP
|
||||
# include "device/device.h"
|
||||
# include "device/hip/device_impl.h"
|
||||
|
||||
# include "util/util_string.h"
|
||||
# include "util/util_windows.h"
|
||||
#endif /* WITH_HIP */
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Initialize HIP support.
 *
 * Returns true when HIP is usable: either precompiled kernels or the HIPCC
 * compiler must be available. With dynamic loading the outcome of the first
 * call is cached and returned on subsequent calls. */
bool device_hip_init()
{
#if !defined(WITH_HIP)
  return false;
#elif defined(WITH_HIP_DYNLOAD)
  static bool initialized = false;
  static bool result = false;

  if (initialized) {
    return result;
  }
  initialized = true;

  const int hipew_result = hipewInit(HIPEW_INIT_HIP);
  if (hipew_result != HIPEW_SUCCESS) {
    VLOG(1) << "HIPEW initialization failed: "
            << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
                                                              "Error opening the library");
    return result;
  }

  VLOG(1) << "HIPEW initialization succeeded";

  /* Usable either with shipped precompiled kernels, or with a compiler for
   * runtime kernel compilation. */
  if (HIPDevice::have_precompiled_kernels()) {
    VLOG(1) << "Found precompiled kernels";
    result = true;
  }
  else if (hipewCompilerPath() != NULL) {
    VLOG(1) << "Found HIPCC " << hipewCompilerPath();
    result = true;
  }
  else {
    VLOG(1) << "Neither precompiled kernels nor HIPCC was found,"
            << " unable to use HIP";
  }

  return result;
#else  /* WITH_HIP_DYNLOAD */
  return true;
#endif /* WITH_HIP_DYNLOAD */
}
|
||||
|
||||
/* Create a new HIP device.
 *
 * Must only be called when HIP support is compiled in; callers are expected
 * to have checked device_hip_init() first. Returned pointer is owned by the
 * caller. */
Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
{
#ifdef WITH_HIP
  return new HIPDevice(info, stats, profiler);
#else
  /* Silence unused-parameter warnings in the stub build. */
  (void)info;
  (void)stats;
  (void)profiler;

  LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen.";

  return nullptr;
#endif
}
|
||||
|
||||
#ifdef WITH_HIP
|
||||
static hipError_t device_hip_safe_init()
|
||||
{
|
||||
# ifdef _WIN32
|
||||
__try {
|
||||
return hipInit(0);
|
||||
}
|
||||
__except (EXCEPTION_EXECUTE_HANDLER) {
|
||||
/* Ignore crashes inside the HIP driver and hope we can
|
||||
* survive even with corrupted HIP installs. */
|
||||
fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n");
|
||||
}
|
||||
|
||||
return hipErrorNoDevice;
|
||||
# else
|
||||
return hipInit(0);
|
||||
# endif
|
||||
}
|
||||
#endif /* WITH_HIP */
|
||||
|
||||
/* Enumerate available HIP devices into `devices`.
 *
 * Devices recognized as driving a display are appended after all other
 * devices, so compute-only devices are preferred by default. On failure the
 * vector is left untouched and an error is printed to stderr. */
void device_hip_info(vector<DeviceInfo> &devices)
{
#ifdef WITH_HIP
  hipError_t result = device_hip_safe_init();
  if (result != hipSuccess) {
    if (result != hipErrorNoDevice) {
      fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result));
    }
    return;
  }

  int count = 0;
  result = hipGetDeviceCount(&count);
  if (result != hipSuccess) {
    fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result));
    return;
  }

  vector<DeviceInfo> display_devices;

  for (int num = 0; num < count; num++) {
    char name[256];

    result = hipDeviceGetName(name, 256, num);
    if (result != hipSuccess) {
      /* Fix: message previously read "HIP :hipDeviceGetName" with a stray
       * colon; keep it consistent with the other error messages here. */
      fprintf(stderr, "HIP hipDeviceGetName: %s\n", hipewErrorString(result));
      continue;
    }

    int major;
    hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num);
    // TODO : (Arya) What is the last major version we are supporting?

    DeviceInfo info;

    info.type = DEVICE_HIP;
    info.description = string(name);
    info.num = num;

    info.has_half_images = (major >= 3);
    info.has_nanovdb = true;
    info.denoisers = 0;

    info.has_gpu_queue = true;
    /* Check if the device has P2P access to any other device in the system. */
    for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
      if (num != peer_num) {
        int can_access = 0;
        hipDeviceCanAccessPeer(&can_access, num, peer_num);
        info.has_peer_memory = (can_access != 0);
      }
    }

    /* Build a stable identifier from the device name and PCI location. */
    int pci_location[3] = {0, 0, 0};
    hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num);
    hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num);
    hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num);
    info.id = string_printf("HIP_%s_%04x:%02x:%02x",
                            name,
                            (unsigned int)pci_location[0],
                            (unsigned int)pci_location[1],
                            (unsigned int)pci_location[2]);

    /* If device has a kernel timeout and no compute preemption, we assume
     * it is connected to a display and will freeze the display while doing
     * computations. */
    /* NOTE(review): preempt_attr is never queried here, so it is always 0 and
     * any device with a kernel timeout is treated as a display device — TODO
     * confirm whether a HIP preemption attribute should be queried. */
    int timeout_attr = 0, preempt_attr = 0;
    hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num);

    if (timeout_attr && !preempt_attr) {
      VLOG(1) << "Device is recognized as display.";
      info.description += " (Display)";
      info.display_device = true;
      display_devices.push_back(info);
    }
    else {
      VLOG(1) << "Device has compute preemption or is not used for display.";
      devices.push_back(info);
    }
    VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
  }

  /* Display devices go last so compute-only devices are picked first. */
  if (!display_devices.empty()) {
    devices.insert(devices.end(), display_devices.begin(), display_devices.end());
  }
#else  /* WITH_HIP */
  (void)devices;
#endif /* WITH_HIP */
}
|
||||
|
||||
/* Return a human-readable report of the attributes of every HIP device,
 * one indented section per device. On initialization or enumeration failure
 * an error string is returned instead. */
string device_hip_capabilities()
{
#ifdef WITH_HIP
  hipError_t result = device_hip_safe_init();
  if (result != hipSuccess) {
    if (result != hipErrorNoDevice) {
      return string("Error initializing HIP: ") + hipewErrorString(result);
    }
    return "No HIP device found\n";
  }

  int count;
  result = hipGetDeviceCount(&count);
  if (result != hipSuccess) {
    return string("Error getting devices: ") + hipewErrorString(result);
  }

  string capabilities = "";
  for (int num = 0; num < count; num++) {
    char name[256];
    if (hipDeviceGetName(name, 256, num) != hipSuccess) {
      continue;
    }
    capabilities += string("\t") + name + "\n";
    int value;
    /* Query one attribute and append it to the report; attributes whose
     * query fails are silently skipped. */
# define GET_ATTR(attr) \
  { \
    if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \
      capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \
    } \
  } \
  (void)0
    /* TODO(sergey): Strip all attributes which are not useful for us
     * or does not depend on the driver.
     */
    GET_ATTR(MaxThreadsPerBlock);
    GET_ATTR(MaxBlockDimX);
    GET_ATTR(MaxBlockDimY);
    GET_ATTR(MaxBlockDimZ);
    GET_ATTR(MaxGridDimX);
    GET_ATTR(MaxGridDimY);
    GET_ATTR(MaxGridDimZ);
    GET_ATTR(MaxSharedMemoryPerBlock);
    GET_ATTR(TotalConstantMemory);
    GET_ATTR(WarpSize);
    GET_ATTR(MaxPitch);
    GET_ATTR(MaxRegistersPerBlock);
    GET_ATTR(ClockRate);
    GET_ATTR(TextureAlignment);
    GET_ATTR(MultiprocessorCount);
    GET_ATTR(KernelExecTimeout);
    GET_ATTR(Integrated);
    GET_ATTR(CanMapHostMemory);
    GET_ATTR(ComputeMode);
    GET_ATTR(MaxTexture1DWidth);
    GET_ATTR(MaxTexture2DWidth);
    GET_ATTR(MaxTexture2DHeight);
    GET_ATTR(MaxTexture3DWidth);
    GET_ATTR(MaxTexture3DHeight);
    GET_ATTR(MaxTexture3DDepth);
    GET_ATTR(ConcurrentKernels);
    GET_ATTR(EccEnabled);
    GET_ATTR(MemoryClockRate);
    GET_ATTR(MemoryBusWidth);
    GET_ATTR(L2CacheSize);
    GET_ATTR(MaxThreadsPerMultiProcessor);
    GET_ATTR(ComputeCapabilityMajor);
    GET_ATTR(ComputeCapabilityMinor);
    GET_ATTR(MaxSharedMemoryPerMultiprocessor);
    GET_ATTR(ManagedMemory);
    GET_ATTR(IsMultiGpuBoard);
# undef GET_ATTR
    capabilities += "\n";
  }

  return capabilities;

#else  /* WITH_HIP */
  return "";
#endif /* WITH_HIP */
}
|
||||
|
||||
CCL_NAMESPACE_END
|
37
intern/cycles/device/hip/device.h
Normal file
37
intern/cycles/device/hip/device.h
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "util/util_string.h"
|
||||
#include "util/util_vector.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class Device;
|
||||
class DeviceInfo;
|
||||
class Profiler;
|
||||
class Stats;
|
||||
|
||||
/* Initialize HIP support; returns true when HIP devices can be used. */
bool device_hip_init();

/* Create a new HIP device (caller owns the returned pointer). */
Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);

/* Append all usable HIP devices to the given list. */
void device_hip_info(vector<DeviceInfo> &devices);

/* Human-readable report of HIP device attributes, for diagnostics. */
string device_hip_capabilities();
|
||||
|
||||
CCL_NAMESPACE_END
|
1343
intern/cycles/device/hip/device_impl.cpp
Normal file
1343
intern/cycles/device/hip/device_impl.cpp
Normal file
File diff suppressed because it is too large
Load Diff
153
intern/cycles/device/hip/device_impl.h
Normal file
153
intern/cycles/device/hip/device_impl.h
Normal file
@@ -0,0 +1,153 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/device.h"
|
||||
# include "device/hip/kernel.h"
|
||||
# include "device/hip/queue.h"
|
||||
# include "device/hip/util.h"
|
||||
|
||||
# include "util/util_map.h"
|
||||
|
||||
# ifdef WITH_HIP_DYNLOAD
|
||||
# include "hipew.h"
|
||||
# else
|
||||
# include "util/util_opengl.h"
|
||||
# endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class DeviceQueue;
|
||||
|
||||
/* HIP implementation of the Cycles Device interface. */
class HIPDevice : public Device {

  friend class HIPContextScope;

 public:
  /* HIP driver handles for this device. */
  hipDevice_t hipDevice;
  hipCtx_t hipContext;
  hipModule_t hipModule;
  /* Headroom kept free when allocating, to avoid exhausting device memory. */
  size_t device_texture_headroom;
  size_t device_working_headroom;
  /* When true, textures may be moved to mapped host memory when the device
   * runs out of memory. */
  bool move_texture_to_host;
  size_t map_host_used;
  size_t map_host_limit;
  int can_map_host;
  int pitch_alignment;
  int hipDevId;
  int hipDevArchitecture;
  /* Only the first error is reported; used to suppress cascades. */
  bool first_error;

  /* Per-allocation bookkeeping for memory owned by this device. */
  struct HIPMem {
    HIPMem() : texobject(0), array(0), use_mapped_host(false)
    {
    }

    hipTextureObject_t texobject;
    hArray array;

    /* If true, a mapped host memory in shared_pointer is being used. */
    bool use_mapped_host;
  };
  typedef map<device_memory *, HIPMem> HIPMemMap;
  HIPMemMap hip_mem_map;
  /* Guards hip_mem_map. */
  thread_mutex hip_mem_map_mutex;

  /* Bindless Textures */
  device_vector<TextureInfo> texture_info;
  bool need_texture_info;

  /* Loaded kernels for this device's module. */
  HIPDeviceKernels kernels;

  static bool have_precompiled_kernels();

  virtual bool show_samples() const override;

  virtual BVHLayoutMask get_bvh_layout_mask() const override;

  void set_error(const string &error) override;

  HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);

  virtual ~HIPDevice();

  bool support_device(const uint /*kernel_features*/);

  bool check_peer_access(Device *peer_device) override;

  bool use_adaptive_compilation();

  virtual string compile_kernel_get_common_cflags(const uint kernel_features);

  /* Compile (or locate a cached build of) the kernel module. */
  string compile_kernel(const uint kernel_features,
                        const char *name,
                        const char *base = "hip",
                        bool force_ptx = false);

  virtual bool load_kernels(const uint kernel_features) override;
  void reserve_local_memory(const uint kernel_features);

  void init_host_memory();

  void load_texture_info();

  /* Free device memory by moving textures to mapped host memory. */
  void move_textures_to_host(size_t size, bool for_texture);

  /* Generic allocation helpers shared by the mem_* overrides below. */
  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);

  void generic_copy_to(device_memory &mem);

  void generic_free(device_memory &mem);

  void mem_alloc(device_memory &mem) override;

  void mem_copy_to(device_memory &mem) override;

  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;

  void mem_zero(device_memory &mem) override;

  void mem_free(device_memory &mem) override;

  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;

  virtual void const_copy_to(const char *name, void *host, size_t size) override;

  void global_alloc(device_memory &mem);

  void global_free(device_memory &mem);

  void tex_alloc(device_texture &mem);

  void tex_free(device_texture &mem);

  /* Graphics resources interoperability. */
  virtual bool should_use_graphics_interop() override;

  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;

  int get_num_multiprocessors();
  int get_max_num_threads_per_multiprocessor();

 protected:
  /* Attribute queries; the second variant falls back to a default on error. */
  bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
  int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
93
intern/cycles/device/hip/graphics_interop.cpp
Normal file
93
intern/cycles/device/hip/graphics_interop.cpp
Normal file
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/hip/graphics_interop.h"
|
||||
|
||||
# include "device/hip/device_impl.h"
|
||||
# include "device/hip/util.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Bind the interop object to the queue it will map/unmap on; the owning
 * device is derived from the queue. */
HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue)
    : queue_(queue), device_(static_cast<HIPDevice *>(queue->device))
{
}
|
||||
|
||||
/* Release the registered graphics resource, if any. */
HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop()
{
  HIPContextScope scope(device_);

  if (hip_graphics_resource_) {
    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
  }
}
|
||||
|
||||
void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination)
|
||||
{
|
||||
const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
|
||||
|
||||
if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
|
||||
return;
|
||||
}
|
||||
|
||||
HIPContextScope scope(device_);
|
||||
|
||||
if (hip_graphics_resource_) {
|
||||
hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
|
||||
}
|
||||
|
||||
const hipError_t result = hipGraphicsGLRegisterBuffer(
|
||||
&hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone);
|
||||
if (result != hipSuccess) {
|
||||
LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result);
|
||||
}
|
||||
|
||||
opengl_pbo_id_ = destination.opengl_pbo_id;
|
||||
buffer_area_ = new_buffer_area;
|
||||
}
|
||||
|
||||
/* Map the registered graphics resource and return a device pointer to it.
 * Returns 0 when no resource has been registered via set_destination(). */
device_ptr HIPDeviceGraphicsInterop::map()
{
  if (!hip_graphics_resource_) {
    return 0;
  }

  const HIPContextScope scope(device_);

  hip_device_assert(device_,
                    hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream()));

  hipDeviceptr_t mapped_buffer;
  size_t mapped_size;
  hip_device_assert(
      device_,
      hipGraphicsResourceGetMappedPointer(&mapped_buffer, &mapped_size, hip_graphics_resource_));

  return static_cast<device_ptr>(mapped_buffer);
}
|
||||
|
||||
/* Unmap the graphics resource previously mapped by map(), on this queue's
 * stream. */
void HIPDeviceGraphicsInterop::unmap()
{
  HIPContextScope scope(device_);

  hip_device_assert(device_,
                    hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream()));
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
61
intern/cycles/device/hip/graphics_interop.h
Normal file
61
intern/cycles/device/hip/graphics_interop.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/device_graphics_interop.h"
|
||||
|
||||
# ifdef WITH_HIP_DYNLOAD
|
||||
# include "hipew.h"
|
||||
# endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class HIPDevice;
|
||||
class HIPDeviceQueue;
|
||||
|
||||
/* OpenGL interoperability for the HIP device: registers an OpenGL PBO with
 * HIP and maps/unmaps it on the owning queue's stream. Non-copyable and
 * non-movable, since it owns a registered graphics resource. */
class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
 public:
  explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);

  HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
  HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;

  ~HIPDeviceGraphicsInterop();

  HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
  HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;

  virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;

  virtual device_ptr map() override;
  virtual void unmap() override;

 protected:
  /* Queue used for map/unmap, and the device it belongs to (not owned). */
  HIPDeviceQueue *queue_ = nullptr;
  HIPDevice *device_ = nullptr;

  /* OpenGL PBO which is currently registered as the destination for the HIP buffer. */
  uint opengl_pbo_id_ = 0;
  /* Buffer area in pixels of the corresponding PBO. */
  int64_t buffer_area_ = 0;

  hipGraphicsResource hip_graphics_resource_ = nullptr;
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif
|
69
intern/cycles/device/hip/kernel.cpp
Normal file
69
intern/cycles/device/hip/kernel.cpp
Normal file
@@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/hip/kernel.h"
|
||||
# include "device/hip/device_impl.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Look up every device kernel in the device's loaded module and cache its
 * function handle together with occupancy information (preferred block size
 * and minimum block count). Kernels that fail to load are reported and left
 * with a null function. */
void HIPDeviceKernels::load(HIPDevice *device)
{
  hipModule_t hipModule = device->hipModule;

  for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
    HIPDeviceKernel &kernel = kernels_[i];

    /* No megakernel used for GPU. */
    if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
      continue;
    }

    const std::string function_name = std::string("kernel_gpu_") +
                                      device_kernel_as_string((DeviceKernel)i);
    hip_device_assert(device,
                      hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str()));

    if (!kernel.function) {
      LOG(ERROR) << "Unable to load kernel " << function_name;
      continue;
    }

    /* Prefer L1 cache over shared memory, and query the launch configuration
     * that maximizes occupancy. */
    hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1));
    hip_device_assert(
        device,
        hipModuleOccupancyMaxPotentialBlockSize(
            &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0));
  }

  loaded = true;
}
|
||||
|
||||
/* Return the cached kernel entry for the given device kernel. */
const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const
{
  return kernels_[(int)kernel];
}
|
||||
|
||||
/* Whether the given kernel was successfully loaded from the module. */
bool HIPDeviceKernels::available(DeviceKernel kernel) const
{
  return kernels_[(int)kernel].function != nullptr;
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_HIP*/
|
54
intern/cycles/device/hip/kernel.h
Normal file
54
intern/cycles/device/hip/kernel.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/device_kernel.h"
|
||||
|
||||
# ifdef WITH_HIP_DYNLOAD
|
||||
# include "hipew.h"
|
||||
# endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class HIPDevice;
|
||||
|
||||
/* HIP kernel and associate occupancy information. */
|
||||
/* HIP kernel and associated occupancy information. */
class HIPDeviceKernel {
 public:
  /* Function handle from the loaded module; nullptr when loading failed. */
  hipFunction_t function = nullptr;

  /* Launch configuration suggested by the occupancy calculator. */
  int num_threads_per_block = 0;
  int min_blocks = 0;
};
|
||||
|
||||
/* Cache of HIP kernels for each DeviceKernel. */
|
||||
/* Cache of HIP kernels for each DeviceKernel. */
class HIPDeviceKernels {
 public:
  /* Populate the cache from the device's loaded module. */
  void load(HIPDevice *device);
  const HIPDeviceKernel &get(DeviceKernel kernel) const;
  bool available(DeviceKernel kernel) const;

 protected:
  /* One entry per DeviceKernel, indexed by the enum value. */
  HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
  bool loaded = false;
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_HIP */
|
209
intern/cycles/device/hip/queue.cpp
Normal file
209
intern/cycles/device/hip/queue.cpp
Normal file
@@ -0,0 +1,209 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/hip/queue.h"
|
||||
|
||||
# include "device/hip/device_impl.h"
|
||||
# include "device/hip/graphics_interop.h"
|
||||
# include "device/hip/kernel.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* HIPDeviceQueue */
|
||||
|
||||
/* Create a queue backed by a non-blocking HIP stream on the given device. */
HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
    : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
{
  const HIPContextScope scope(hip_device_);
  hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
}
|
||||
|
||||
/* Destroy the underlying HIP stream. */
HIPDeviceQueue::~HIPDeviceQueue()
{
  const HIPContextScope scope(hip_device_);
  hipStreamDestroy(hip_stream_);
}
|
||||
|
||||
/* Number of integrator states to process concurrently.
 * Currently a hard-coded constant; see TODOs below. */
int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
{
  /* TODO: compute automatically. */
  /* TODO: must have at least num_threads_per_block. */
  return 14416128;
}
|
||||
|
||||
int HIPDeviceQueue::num_concurrent_busy_states() const
|
||||
{
|
||||
const int max_num_threads = hip_device_->get_num_multiprocessors() *
|
||||
hip_device_->get_max_num_threads_per_multiprocessor();
|
||||
|
||||
if (max_num_threads == 0) {
|
||||
return 65536;
|
||||
}
|
||||
|
||||
return 4 * max_num_threads;
|
||||
}
|
||||
|
||||
/* Prepare the device for executing kernels on this queue. */
void HIPDeviceQueue::init_execution()
{
  /* Synchronize all textures and memory copies before executing task. */
  HIPContextScope scope(hip_device_);
  hip_device_->load_texture_info();
  hip_device_assert(hip_device_, hipDeviceSynchronize());

  debug_init_execution();
}
|
||||
|
||||
/* Whether the given kernel was loaded on the owning device. */
bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
{
  return hip_device_->kernels.available(kernel);
}
|
||||
|
||||
/* Enqueue a kernel launch covering `work_size` items, passing `args` through
 * to the kernel. Returns false when the device is already in an error state
 * or the launch failed. */
bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
{
  if (hip_device_->have_error()) {
    return false;
  }

  debug_enqueue(kernel, work_size);

  const HIPContextScope scope(hip_device_);
  const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);

  /* Compute kernel launch parameters. */
  const int num_threads_per_block = hip_kernel.num_threads_per_block;
  const int num_blocks = divide_up(work_size, num_threads_per_block);

  int shared_mem_bytes = 0;

  switch (kernel) {
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      /* See parallel_active_index.h for why this amount of shared memory is needed. */
      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
      break;
    default:
      break;
  }

  /* Launch kernel. */
  hip_device_assert(hip_device_,
                    hipModuleLaunchKernel(hip_kernel.function,
                                          num_blocks,
                                          1,
                                          1,
                                          num_threads_per_block,
                                          1,
                                          1,
                                          shared_mem_bytes,
                                          hip_stream_,
                                          args,
                                          0));
  return !(hip_device_->have_error());
}
|
||||
|
||||
bool HIPDeviceQueue::synchronize()
|
||||
{
|
||||
if (hip_device_->have_error()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const HIPContextScope scope(hip_device_);
|
||||
hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
|
||||
debug_synchronize();
|
||||
|
||||
return !(hip_device_->have_error());
|
||||
}
|
||||
|
||||
/* Asynchronously zero-fill device memory on this queue's stream, allocating
 * it on demand. Globals and textures are handled elsewhere. */
void HIPDeviceQueue::zero_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  /* Zero memory on device. */
  assert(mem.device_pointer != 0);

  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
}
|
||||
|
||||
/* Asynchronously copy host memory to the device on this queue's stream,
 * allocating device memory on demand. */
void HIPDeviceQueue::copy_to_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  /* Allocate on demand. */
  if (mem.device_pointer == 0) {
    hip_device_->mem_alloc(mem);
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory to device. */
  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemcpyHtoDAsync(
          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
}
|
||||
|
||||
/* Asynchronously copy device memory back to the host on this queue's stream.
 * The memory must already be allocated on both sides. */
void HIPDeviceQueue::copy_from_device(device_memory &mem)
{
  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

  if (mem.memory_size() == 0) {
    return;
  }

  assert(mem.device_pointer != 0);
  assert(mem.host_pointer != nullptr);

  /* Copy memory from device. */
  const HIPContextScope scope(hip_device_);
  hip_device_assert(
      hip_device_,
      hipMemcpyDtoHAsync(
          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
}
|
||||
|
||||
// TODO : (Arya) Enable this after stabilizing dev branch
/* Create an OpenGL interop helper bound to this queue. */
unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
{
  return make_unique<HIPDeviceGraphicsInterop>(this);
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_HIP */
|
68
intern/cycles/device/hip/queue.h
Normal file
68
intern/cycles/device/hip/queue.h
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/device_kernel.h"
|
||||
# include "device/device_memory.h"
|
||||
# include "device/device_queue.h"
|
||||
|
||||
# include "device/hip/util.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class HIPDevice;
|
||||
class device_memory;
|
||||
|
||||
/* Base class for HIP queues. */
|
||||
class HIPDeviceQueue : public DeviceQueue {
|
||||
public:
|
||||
HIPDeviceQueue(HIPDevice *device);
|
||||
~HIPDeviceQueue();
|
||||
|
||||
virtual int num_concurrent_states(const size_t state_size) const override;
|
||||
virtual int num_concurrent_busy_states() const override;
|
||||
|
||||
virtual void init_execution() override;
|
||||
|
||||
virtual bool kernel_available(DeviceKernel kernel) const override;
|
||||
|
||||
virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
|
||||
|
||||
virtual bool synchronize() override;
|
||||
|
||||
virtual void zero_to_device(device_memory &mem) override;
|
||||
virtual void copy_to_device(device_memory &mem) override;
|
||||
virtual void copy_from_device(device_memory &mem) override;
|
||||
|
||||
virtual hipStream_t stream()
|
||||
{
|
||||
return hip_stream_;
|
||||
}
|
||||
|
||||
// TODO : (Arya) Enable this after stabilizing the dev branch
|
||||
virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
|
||||
|
||||
protected:
|
||||
HIPDevice *hip_device_;
|
||||
hipStream_t hip_stream_;
|
||||
};
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_HIP */
|
61
intern/cycles/device/hip/util.cpp
Normal file
61
intern/cycles/device/hip/util.cpp
Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# include "device/hip/util.h"
|
||||
# include "device/hip/device_impl.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
HIPContextScope::HIPContextScope(HIPDevice *device) : device(device)
|
||||
{
|
||||
hip_device_assert(device, hipCtxPushCurrent(device->hipContext));
|
||||
}
|
||||
|
||||
HIPContextScope::~HIPContextScope()
|
||||
{
|
||||
hip_device_assert(device, hipCtxPopCurrent(NULL));
|
||||
}
|
||||
|
||||
# ifndef WITH_HIP_DYNLOAD
|
||||
const char *hipewErrorString(hipError_t result)
|
||||
{
|
||||
/* We can only give error code here without major code duplication, that
|
||||
* should be enough since dynamic loading is only being disabled by folks
|
||||
* who knows what they're doing anyway.
|
||||
*
|
||||
* NOTE: Avoid call from several threads.
|
||||
*/
|
||||
static string error;
|
||||
error = string_printf("%d", result);
|
||||
return error.c_str();
|
||||
}
|
||||
|
||||
const char *hipewCompilerPath()
|
||||
{
|
||||
return CYCLES_HIP_HIPCC_EXECUTABLE;
|
||||
}
|
||||
|
||||
int hipewCompilerVersion()
|
||||
{
|
||||
return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10);
|
||||
}
|
||||
# endif
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_HIP */
|
63
intern/cycles/device/hip/util.h
Normal file
63
intern/cycles/device/hip/util.h
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef WITH_HIP
|
||||
|
||||
# ifdef WITH_HIP_DYNLOAD
|
||||
# include "hipew.h"
|
||||
# endif
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
class HIPDevice;
|
||||
|
||||
/* Utility to push/pop HIP context. */
|
||||
class HIPContextScope {
|
||||
public:
|
||||
HIPContextScope(HIPDevice *device);
|
||||
~HIPContextScope();
|
||||
|
||||
private:
|
||||
HIPDevice *device;
|
||||
};
|
||||
|
||||
/* Utility for checking return values of HIP function calls. */
|
||||
# define hip_device_assert(hip_device, stmt) \
|
||||
{ \
|
||||
hipError_t result = stmt; \
|
||||
if (result != hipSuccess) { \
|
||||
const char *name = hipewErrorString(result); \
|
||||
hip_device->set_error( \
|
||||
string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
|
||||
} \
|
||||
} \
|
||||
(void)0
|
||||
|
||||
# define hip_assert(stmt) hip_device_assert(this, stmt)
|
||||
|
||||
# ifndef WITH_HIP_DYNLOAD
|
||||
/* Transparently implement some functions, so majority of the file does not need
|
||||
* to worry about difference between dynamically loaded and linked HIP at all. */
|
||||
const char *hipewErrorString(hipError_t result);
|
||||
const char *hipewCompilerPath();
|
||||
int hipewCompilerVersion();
|
||||
# endif /* WITH_HIP_DYNLOAD */
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* WITH_HIP */
|
@@ -315,14 +315,14 @@ class MultiDevice : public Device {
|
||||
stats.mem_alloc(mem.device_size - existing_size);
|
||||
}
|
||||
|
||||
void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
|
||||
void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override
|
||||
{
|
||||
device_ptr key = mem.device_pointer;
|
||||
int i = 0, sub_h = h / devices.size();
|
||||
size_t i = 0, sub_h = h / devices.size();
|
||||
|
||||
foreach (SubDevice &sub, devices) {
|
||||
int sy = y + i * sub_h;
|
||||
int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
|
||||
size_t sy = y + i * sub_h;
|
||||
size_t sh = (i == (size_t)devices.size() - 1) ? h - sub_h * i : sub_h;
|
||||
|
||||
SubDevice *owner_sub = find_matching_mem_device(key, sub);
|
||||
mem.device = owner_sub->device;
|
||||
|
@@ -315,6 +315,11 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
|
||||
group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
|
||||
group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
|
||||
group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
|
||||
group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
|
||||
group_descs[PG_HITV].hitgroup.moduleCH = optix_module;
|
||||
group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
|
||||
group_descs[PG_HITV].hitgroup.moduleAH = optix_module;
|
||||
group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test";
|
||||
|
||||
if (kernel_features & KERNEL_FEATURE_HAIR) {
|
||||
if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
|
||||
@@ -397,6 +402,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
|
||||
trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
|
||||
trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
|
||||
trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
|
||||
trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH);
|
||||
trace_css = std::max(trace_css,
|
||||
stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
|
||||
trace_css = std::max(trace_css,
|
||||
@@ -421,6 +427,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
|
||||
pipeline_groups.push_back(groups[PG_HITD]);
|
||||
pipeline_groups.push_back(groups[PG_HITS]);
|
||||
pipeline_groups.push_back(groups[PG_HITL]);
|
||||
pipeline_groups.push_back(groups[PG_HITV]);
|
||||
if (motion_blur) {
|
||||
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
|
||||
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
|
||||
@@ -459,6 +466,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
|
||||
pipeline_groups.push_back(groups[PG_HITD]);
|
||||
pipeline_groups.push_back(groups[PG_HITS]);
|
||||
pipeline_groups.push_back(groups[PG_HITL]);
|
||||
pipeline_groups.push_back(groups[PG_HITV]);
|
||||
if (motion_blur) {
|
||||
pipeline_groups.push_back(groups[PG_HITD_MOTION]);
|
||||
pipeline_groups.push_back(groups[PG_HITS_MOTION]);
|
||||
@@ -1390,25 +1398,33 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
||||
/* Set user instance ID to object index (but leave low bit blank). */
|
||||
instance.instanceId = ob->get_device_index() << 1;
|
||||
|
||||
/* Have to have at least one bit in the mask, or else instance would always be culled. */
|
||||
instance.visibilityMask = 1;
|
||||
|
||||
if (ob->get_geometry()->has_volume) {
|
||||
/* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
|
||||
/* Add some of the object visibility bits to the mask.
|
||||
* __prim_visibility contains the combined visibility bits of all instances, so is not
|
||||
* reliable if they differ between instances. But the OptiX visibility mask can only contain
|
||||
* 8 bits, so have to trade-off here and select just a few important ones.
|
||||
*/
|
||||
instance.visibilityMask |= 2;
|
||||
instance.visibilityMask = ob->visibility_for_tracing() & 0xFF;
|
||||
|
||||
/* Have to have at least one bit in the mask, or else instance would always be culled. */
|
||||
if (0 == instance.visibilityMask) {
|
||||
instance.visibilityMask = 0xFF;
|
||||
}
|
||||
|
||||
if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
|
||||
/* Same applies to curves (so they can be skipped in local trace calls). */
|
||||
instance.visibilityMask |= 4;
|
||||
|
||||
if (motion_blur && ob->get_geometry()->has_motion_blur() &&
|
||||
if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
|
||||
static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
|
||||
if (motion_blur && ob->get_geometry()->has_motion_blur()) {
|
||||
/* Select between motion blur and non-motion blur built-in intersection module. */
|
||||
instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
|
||||
* since it needs to filter out endcaps there).
|
||||
* It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
|
||||
* programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
|
||||
*/
|
||||
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
|
||||
}
|
||||
|
||||
/* Insert motion traversable if object has motion. */
|
||||
if (motion_blur && ob->use_motion()) {
|
||||
@@ -1474,7 +1490,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
||||
delete[] reinterpret_cast<uint8_t *>(&motion_transform);
|
||||
|
||||
/* Disable instance transform if object uses motion transform already. */
|
||||
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
|
||||
instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
|
||||
|
||||
/* Get traversable handle to motion transform. */
|
||||
optixConvertPointerToTraversableHandle(context,
|
||||
@@ -1491,7 +1507,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
|
||||
}
|
||||
else {
|
||||
/* Disable instance transform if geometry already has it applied to vertex data. */
|
||||
instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
|
||||
instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
|
||||
/* Non-instanced objects read ID from 'prim_object', so distinguish
|
||||
* them from instanced objects with the low bit set. */
|
||||
instance.instanceId |= 1;
|
||||
|
@@ -40,6 +40,7 @@ enum {
|
||||
PG_HITD, /* Default hit group. */
|
||||
PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
|
||||
PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
|
||||
PG_HITV, /* __VOLUME__ hit group. */
|
||||
PG_HITD_MOTION,
|
||||
PG_HITS_MOTION,
|
||||
PG_CALL_SVM_AO,
|
||||
@@ -51,7 +52,7 @@ enum {
|
||||
static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
|
||||
static const int NUM_MIS_PROGRAM_GROUPS = 1;
|
||||
static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
|
||||
static const int NUM_HIT_PROGRAM_GROUPS = 5;
|
||||
static const int NUM_HIT_PROGRAM_GROUPS = 6;
|
||||
static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
|
||||
static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;
|
||||
|
||||
|
@@ -244,7 +244,7 @@ static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>>
|
||||
const int slice_height = max(lround(height * weight), 1);
|
||||
|
||||
/* Disallow negative values to deal with situations when there are more compute devices than
|
||||
* scanlines. */
|
||||
* scan-lines. */
|
||||
const int remaining_height = max(0, height - current_y);
|
||||
|
||||
BufferParams slide_params = buffer_params;
|
||||
@@ -801,7 +801,7 @@ void PathTrace::tile_buffer_write_to_disk()
|
||||
}
|
||||
|
||||
if (!tile_manager_.write_tile(*buffers)) {
|
||||
LOG(ERROR) << "Error writing tile to file.";
|
||||
device_->set_error("Error writing tile to file");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -894,7 +894,14 @@ void PathTrace::process_full_buffer_from_disk(string_view filename)
|
||||
|
||||
DenoiseParams denoise_params;
|
||||
if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
|
||||
LOG(ERROR) << "Error reading tiles from file.";
|
||||
const string error_message = "Error reading tiles from file";
|
||||
if (progress_) {
|
||||
progress_->set_error(error_message);
|
||||
progress_->set_cancel(error_message);
|
||||
}
|
||||
else {
|
||||
LOG(ERROR) << error_message;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1028,6 +1035,8 @@ static const char *device_type_for_description(const DeviceType type)
|
||||
return "CUDA";
|
||||
case DEVICE_OPTIX:
|
||||
return "OptiX";
|
||||
case DEVICE_HIP:
|
||||
return "HIP";
|
||||
case DEVICE_DUMMY:
|
||||
return "Dummy";
|
||||
case DEVICE_MULTI:
|
||||
|
@@ -286,7 +286,7 @@ class PathTrace {
|
||||
/* Parameters of the big tile with the current resolution divider applied. */
|
||||
BufferParams effective_big_tile_params;
|
||||
|
||||
/* Denosier was run and there are denoised versions of the passes in the render buffers. */
|
||||
/* Denoiser was run and there are denoised versions of the passes in the render buffers. */
|
||||
bool has_denoised_result = false;
|
||||
|
||||
/* Current tile has been written (to either disk or callback.
|
||||
|
@@ -104,7 +104,7 @@ class PathTraceWork {
|
||||
* - Copies work's render buffer to its device. */
|
||||
void copy_from_render_buffers(const RenderBuffers *render_buffers);
|
||||
|
||||
/* Special version of the `copy_from_render_buffers()` which only copies denosied passes from the
|
||||
/* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the
|
||||
* given render buffers, leaving rest of the passes.
|
||||
*
|
||||
* Same notes about device copying applies to this call as well. */
|
||||
|
@@ -19,6 +19,8 @@
|
||||
#include "device/cpu/kernel.h"
|
||||
#include "device/device.h"
|
||||
|
||||
#include "kernel/kernel_path_state.h"
|
||||
|
||||
#include "integrator/pass_accessor_cpu.h"
|
||||
|
||||
#include "render/buffers.h"
|
||||
@@ -116,13 +118,17 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
|
||||
const KernelWorkTile &work_tile,
|
||||
const int samples_num)
|
||||
{
|
||||
const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
|
||||
const bool has_bake = device_scene_->data.bake.use;
|
||||
|
||||
IntegratorStateCPU integrator_states[2] = {};
|
||||
IntegratorStateCPU integrator_states[2];
|
||||
|
||||
IntegratorStateCPU *state = &integrator_states[0];
|
||||
IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
|
||||
IntegratorStateCPU *shadow_catcher_state = nullptr;
|
||||
|
||||
if (device_scene_->data.integrator.has_shadow_catcher) {
|
||||
shadow_catcher_state = &integrator_states[1];
|
||||
path_state_init_queues(kernel_globals, shadow_catcher_state);
|
||||
}
|
||||
|
||||
KernelWorkTile sample_work_tile = work_tile;
|
||||
float *render_buffer = buffers_->buffer.data();
|
||||
@@ -147,7 +153,7 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
|
||||
|
||||
kernels_.integrator_megakernel(kernel_globals, state, render_buffer);
|
||||
|
||||
if (has_shadow_catcher) {
|
||||
if (shadow_catcher_state) {
|
||||
kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
|
||||
}
|
||||
|
||||
|
@@ -95,8 +95,8 @@ void PathTraceWorkGPU::alloc_integrator_soa()
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
break; \
|
||||
}
|
||||
#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
|
||||
if (array_index == array_size - 1) { \
|
||||
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
|
||||
if (array_index == gpu_array_size - 1) { \
|
||||
break; \
|
||||
} \
|
||||
}
|
||||
@@ -738,7 +738,8 @@ void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,
|
||||
|
||||
get_render_tile_film_pixels(destination, pass_mode, num_samples);
|
||||
|
||||
gpu_display_rgba_half_.copy_from_device();
|
||||
queue_->copy_from_device(gpu_display_rgba_half_);
|
||||
queue_->synchronize();
|
||||
|
||||
gpu_display->copy_pixels_to_texture(
|
||||
gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
|
||||
|
@@ -384,7 +384,7 @@ bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
|
||||
}
|
||||
|
||||
if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
|
||||
render_work->tile.denoise = true;
|
||||
render_work->tile.denoise = !tile_manager_.has_multiple_tiles();
|
||||
any_scheduled = true;
|
||||
}
|
||||
|
||||
@@ -903,6 +903,12 @@ bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* When multiple tiles are used the full frame will be denoised.
|
||||
* Avoid per-tile denoising to save up render time. */
|
||||
if (tile_manager_.has_multiple_tiles()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (done()) {
|
||||
/* Always denoise at the last sample. */
|
||||
return true;
|
||||
|
@@ -31,7 +31,7 @@ class RenderWork {
|
||||
int resolution_divider = 1;
|
||||
|
||||
/* Initialize render buffers.
|
||||
* Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the
|
||||
* Includes steps like zeroing the buffer on the device, and optional reading of pixels from the
|
||||
* baking target. */
|
||||
bool init_render_buffers = false;
|
||||
|
||||
|
@@ -149,14 +149,14 @@ bool ShaderEval::eval_gpu(Device *device,
|
||||
|
||||
/* Execute work on GPU in chunk, so we can cancel.
|
||||
* TODO : query appropriate size from device.*/
|
||||
const int chunk_size = 65536;
|
||||
const int64_t chunk_size = 65536;
|
||||
|
||||
const int work_size = output.size();
|
||||
const int64_t work_size = output.size();
|
||||
void *d_input = (void *)input.device_pointer;
|
||||
void *d_output = (void *)output.device_pointer;
|
||||
|
||||
for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
|
||||
int d_work_size = min(chunk_size, work_size - d_offset);
|
||||
for (int64_t d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
|
||||
int64_t d_work_size = std::min(chunk_size, work_size - d_offset);
|
||||
void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};
|
||||
|
||||
queue->enqueue(kernel, d_work_size, args);
|
||||
|
@@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA
|
||||
device/cuda/kernel.cu
|
||||
)
|
||||
|
||||
set(SRC_DEVICE_HIP
|
||||
device/hip/kernel.cpp
|
||||
)
|
||||
|
||||
set(SRC_DEVICE_OPTIX
|
||||
device/optix/kernel.cu
|
||||
device/optix/kernel_shader_raytrace.cu
|
||||
@@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS
|
||||
device/cuda/globals.h
|
||||
)
|
||||
|
||||
set(SRC_DEVICE_HIP_HEADERS
|
||||
device/hip/compat.h
|
||||
device/hip/config.h
|
||||
device/hip/globals.h
|
||||
)
|
||||
|
||||
set(SRC_DEVICE_OPTIX_HEADERS
|
||||
device/optix/compat.h
|
||||
device/optix/globals.h
|
||||
@@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES)
|
||||
cycles_set_solution_folder(cycles_kernel_cuda)
|
||||
endif()
|
||||
|
||||
####################################################### START
|
||||
|
||||
# HIP module
|
||||
|
||||
if(WITH_CYCLES_HIP_BINARIES)
|
||||
# 64 bit only
|
||||
set(HIP_BITS 64)
|
||||
|
||||
# HIP version
|
||||
execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT)
|
||||
string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}")
|
||||
string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}")
|
||||
set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}")
|
||||
|
||||
|
||||
message(WARNING
|
||||
"HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected")
|
||||
|
||||
# build for each arch
|
||||
set(hip_sources device/hip/kernel.cpp
|
||||
${SRC_HEADERS}
|
||||
${SRC_DEVICE_HIP_HEADERS}
|
||||
${SRC_BVH_HEADERS}
|
||||
${SRC_SVM_HEADERS}
|
||||
${SRC_GEOM_HEADERS}
|
||||
${SRC_INTEGRATOR_HEADERS}
|
||||
${SRC_CLOSURE_HEADERS}
|
||||
${SRC_UTIL_HEADERS}
|
||||
)
|
||||
set(hip_fatbins)
|
||||
|
||||
macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental)
|
||||
if(${arch} MATCHES "compute_.*")
|
||||
set(format "ptx")
|
||||
else()
|
||||
set(format "fatbin")
|
||||
endif()
|
||||
set(hip_file ${name}_${arch}.${format})
|
||||
|
||||
set(kernel_sources ${sources})
|
||||
if(NOT ${prev_arch} STREQUAL "none")
|
||||
if(${prev_arch} MATCHES "compute_.*")
|
||||
set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
|
||||
else()
|
||||
set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(hip_kernel_src "/device/hip/${name}.cpp")
|
||||
|
||||
set(hip_flags ${flags}
|
||||
-D CCL_NAMESPACE_BEGIN=
|
||||
-D CCL_NAMESPACE_END=
|
||||
-D HIPCC
|
||||
-m ${HIP_BITS}
|
||||
-I ${CMAKE_CURRENT_SOURCE_DIR}/..
|
||||
-I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip
|
||||
--use_fast_math
|
||||
-o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
|
||||
|
||||
if(${experimental})
|
||||
set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
|
||||
set(name ${name}_experimental)
|
||||
endif()
|
||||
|
||||
if(WITH_CYCLES_DEBUG)
|
||||
set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
|
||||
endif()
|
||||
|
||||
if(WITH_NANOVDB)
|
||||
set(hip_flags ${hip_flags}
|
||||
-D WITH_NANOVDB
|
||||
-I "${NANOVDB_INCLUDE_DIR}")
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
set(prev_arch "none")
|
||||
foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
|
||||
set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE})
|
||||
set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR})
|
||||
if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir)
|
||||
# Compile regular kernel
|
||||
CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE)
|
||||
|
||||
if(WITH_CYCLES_HIP_BUILD_SERIAL)
|
||||
set(prev_arch ${arch})
|
||||
endif()
|
||||
|
||||
unset(hip_hipcc_executable)
|
||||
unset(hip_toolkit_root_dir)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins})
|
||||
cycles_set_solution_folder(cycles_kernel_hip)
|
||||
endif()
|
||||
|
||||
####################################################### END
|
||||
# OptiX PTX modules
|
||||
|
||||
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
|
||||
@@ -602,11 +710,13 @@ endif()
|
||||
cycles_add_library(cycles_kernel "${LIB}"
|
||||
${SRC_DEVICE_CPU}
|
||||
${SRC_DEVICE_CUDA}
|
||||
${SRC_DEVICE_HIP}
|
||||
${SRC_DEVICE_OPTIX}
|
||||
${SRC_HEADERS}
|
||||
${SRC_DEVICE_CPU_HEADERS}
|
||||
${SRC_DEVICE_GPU_HEADERS}
|
||||
${SRC_DEVICE_CUDA_HEADERS}
|
||||
${SRC_DEVICE_HIP_HEADERS}
|
||||
${SRC_DEVICE_OPTIX_HEADERS}
|
||||
${SRC_BVH_HEADERS}
|
||||
${SRC_CLOSURE_HEADERS}
|
||||
@@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS})
|
||||
source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
|
||||
source_group("kernel" FILES ${SRC_HEADERS})
|
||||
source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
|
||||
source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS})
|
||||
source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
|
||||
source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
|
||||
source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
|
||||
@@ -632,14 +743,19 @@ endif()
|
||||
if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
|
||||
add_dependencies(cycles_kernel cycles_kernel_optix)
|
||||
endif()
|
||||
if(WITH_CYCLES_HIP)
|
||||
add_dependencies(cycles_kernel cycles_kernel_hip)
|
||||
endif()
|
||||
|
||||
# Install kernel source for runtime compilation
|
||||
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
|
||||
delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
|
||||
|
@@ -167,15 +167,25 @@ ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
|
||||
uint p4 = visibility;
|
||||
uint p5 = PRIMITIVE_NONE;
|
||||
|
||||
uint ray_mask = visibility & 0xFF;
|
||||
uint ray_flags = OPTIX_RAY_FLAG_NONE;
|
||||
if (0 == ray_mask && (visibility & ~0xFF) != 0) {
|
||||
ray_mask = 0xFF;
|
||||
ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
|
||||
}
|
||||
else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
|
||||
ray_flags = OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
|
||||
}
|
||||
|
||||
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
|
||||
ray->P,
|
||||
ray->D,
|
||||
0.0f,
|
||||
ray->t,
|
||||
ray->time,
|
||||
0xF,
|
||||
OPTIX_RAY_FLAG_NONE,
|
||||
0, // SBT offset for PG_HITD
|
||||
ray_mask,
|
||||
ray_flags,
|
||||
0, /* SBT offset for PG_HITD */
|
||||
0,
|
||||
0,
|
||||
p0,
|
||||
@@ -251,11 +261,11 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
|
||||
uint p2 = ((uint64_t)local_isect) & 0xFFFFFFFF;
|
||||
uint p3 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF;
|
||||
uint p4 = local_object;
|
||||
// Is set to zero on miss or if ray is aborted, so can be used as return value
|
||||
/* Is set to zero on miss or if ray is aborted, so can be used as return value. */
|
||||
uint p5 = max_hits;
|
||||
|
||||
if (local_isect) {
|
||||
local_isect->num_hits = 0; // Initialize hit count to zero
|
||||
local_isect->num_hits = 0; /* Initialize hit count to zero. */
|
||||
}
|
||||
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
|
||||
ray->P,
|
||||
@@ -263,11 +273,10 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
|
||||
0.0f,
|
||||
ray->t,
|
||||
ray->time,
|
||||
// Skip curves
|
||||
0x3,
|
||||
// Need to always call into __anyhit__kernel_optix_local_hit
|
||||
0xFF,
|
||||
/* Need to always call into __anyhit__kernel_optix_local_hit. */
|
||||
OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
|
||||
2, // SBT offset for PG_HITL
|
||||
2, /* SBT offset for PG_HITL */
|
||||
0,
|
||||
0,
|
||||
p0,
|
||||
@@ -365,17 +374,22 @@ ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
|
||||
uint p4 = visibility;
|
||||
uint p5 = false;
|
||||
|
||||
*num_hits = 0; // Initialize hit count to zero
|
||||
uint ray_mask = visibility & 0xFF;
|
||||
if (0 == ray_mask && (visibility & ~0xFF) != 0) {
|
||||
ray_mask = 0xFF;
|
||||
}
|
||||
|
||||
*num_hits = 0; /* Initialize hit count to zero. */
|
||||
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
|
||||
ray->P,
|
||||
ray->D,
|
||||
0.0f,
|
||||
ray->t,
|
||||
ray->time,
|
||||
0xF,
|
||||
// Need to always call into __anyhit__kernel_optix_shadow_all_hit
|
||||
ray_mask,
|
||||
/* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
|
||||
OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
|
||||
1, // SBT offset for PG_HITS
|
||||
1, /* SBT offset for PG_HITS */
|
||||
0,
|
||||
0,
|
||||
p0,
|
||||
@@ -444,16 +458,21 @@ ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
|
||||
uint p4 = visibility;
|
||||
uint p5 = PRIMITIVE_NONE;
|
||||
|
||||
uint ray_mask = visibility & 0xFF;
|
||||
if (0 == ray_mask && (visibility & ~0xFF) != 0) {
|
||||
ray_mask = 0xFF;
|
||||
}
|
||||
|
||||
optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
|
||||
ray->P,
|
||||
ray->D,
|
||||
0.0f,
|
||||
ray->t,
|
||||
ray->time,
|
||||
// Skip everything but volumes
|
||||
0x2,
|
||||
OPTIX_RAY_FLAG_NONE,
|
||||
0, // SBT offset for PG_HITD
|
||||
ray_mask,
|
||||
/* Need to always call into __anyhit__kernel_optix_volume_test. */
|
||||
OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
|
||||
3, /* SBT offset for PG_HITV */
|
||||
0,
|
||||
0,
|
||||
p0,
|
||||
|
@@ -21,11 +21,15 @@ CCL_NAMESPACE_BEGIN
|
||||
/* Given an array of states, build an array of indices for which the states
|
||||
* are active.
|
||||
*
|
||||
* Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
|
||||
* Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
|
||||
|
||||
#include "util/util_atomic.h"
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
template<uint blocksize, typename IsActiveOp>
|
||||
__device__ void gpu_parallel_active_index_array(const uint num_states,
|
||||
|
@@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#include "util/util_atomic.h"
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
|
||||
{
|
||||
|
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
|
||||
* the overall cost of the algorithm while keeping the work complexity O(n) and
|
||||
* the step complexity O(log n). (Brent's Theorem optimization) */
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
|
||||
template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
|
||||
__device__ void gpu_parallel_sum(
|
||||
|
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
|
||||
|
||||
#include "util/util_atomic.h"
|
||||
|
||||
#ifdef __HIP__
|
||||
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
|
||||
#else
|
||||
# define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
|
||||
#endif
|
||||
#define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)
|
||||
|
||||
template<uint blocksize, typename GetKeyOp>
|
||||
|
121
intern/cycles/kernel/device/hip/compat.h
Normal file
121
intern/cycles/kernel/device/hip/compat.h
Normal file
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#define __KERNEL_GPU__
|
||||
#define __KERNEL_HIP__
|
||||
#define CCL_NAMESPACE_BEGIN
|
||||
#define CCL_NAMESPACE_END
|
||||
|
||||
#ifndef ATTR_FALLTHROUGH
|
||||
# define ATTR_FALLTHROUGH
|
||||
#endif
|
||||
|
||||
#ifdef __HIPCC_RTC__
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
#else
|
||||
# include <stdint.h>
|
||||
#endif
|
||||
|
||||
#ifdef CYCLES_HIPBIN_CC
|
||||
# define FLT_MIN 1.175494350822287507969e-38f
|
||||
# define FLT_MAX 340282346638528859811704183484516925440.0f
|
||||
# define FLT_EPSILON 1.192092896e-07F
|
||||
#endif
|
||||
|
||||
/* Qualifiers */
|
||||
|
||||
#define ccl_device __device__ __inline__
|
||||
#define ccl_device_inline __device__ __inline__
|
||||
#define ccl_device_forceinline __device__ __forceinline__
|
||||
#define ccl_device_noinline __device__ __noinline__
|
||||
#define ccl_device_noinline_cpu ccl_device
|
||||
#define ccl_global
|
||||
#define ccl_static_constant __constant__
|
||||
#define ccl_device_constant __constant__ __device__
|
||||
#define ccl_constant const
|
||||
#define ccl_gpu_shared __shared__
|
||||
#define ccl_private
|
||||
#define ccl_may_alias
|
||||
#define ccl_addr_space
|
||||
#define ccl_restrict __restrict__
|
||||
#define ccl_loop_no_unroll
|
||||
#define ccl_align(n) __align__(n)
|
||||
#define ccl_optional_struct_init
|
||||
|
||||
#define kernel_assert(cond)
|
||||
|
||||
/* Types */
|
||||
#ifdef __HIP__
|
||||
# include "hip/hip_fp16.h"
|
||||
# include "hip/hip_runtime.h"
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
# include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#define ccl_gpu_thread_idx_x (threadIdx.x)
|
||||
#define ccl_gpu_block_dim_x (blockDim.x)
|
||||
#define ccl_gpu_block_idx_x (blockIdx.x)
|
||||
#define ccl_gpu_grid_dim_x (gridDim.x)
|
||||
#define ccl_gpu_warp_size (warpSize)
|
||||
|
||||
#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
|
||||
#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
|
||||
|
||||
/* GPU warp synchronization */
|
||||
|
||||
#define ccl_gpu_syncthreads() __syncthreads()
|
||||
#define ccl_gpu_ballot(predicate) __ballot(predicate)
|
||||
#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
|
||||
#define ccl_gpu_popc(x) __popc(x)
|
||||
|
||||
/* GPU texture objects */
|
||||
typedef hipTextureObject_t ccl_gpu_tex_object;
|
||||
|
||||
template<typename T>
|
||||
ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
|
||||
const float x,
|
||||
const float y)
|
||||
{
|
||||
return tex2D<T>(texobj, x, y);
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
|
||||
const float x,
|
||||
const float y,
|
||||
const float z)
|
||||
{
|
||||
return tex3D<T>(texobj, x, y, z);
|
||||
}
|
||||
|
||||
/* Use fast math functions */
|
||||
|
||||
#define cosf(x) __cosf(((float)(x)))
|
||||
#define sinf(x) __sinf(((float)(x)))
|
||||
#define powf(x, y) __powf(((float)(x)), ((float)(y)))
|
||||
#define tanf(x) __tanf(((float)(x)))
|
||||
#define logf(x) __logf(((float)(x)))
|
||||
#define expf(x) __expf(((float)(x)))
|
||||
|
||||
/* Types */
|
||||
|
||||
#include "util/util_half.h"
|
||||
#include "util/util_types.h"
|
57
intern/cycles/kernel/device/hip/config.h
Normal file
57
intern/cycles/kernel/device/hip/config.h
Normal file
@@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Device data taken from HIP occupancy calculator.
|
||||
*
|
||||
* Terminology
|
||||
* - HIP GPUs have multiple streaming multiprocessors
|
||||
* - Each multiprocessor executes multiple thread blocks
|
||||
* - Each thread block contains a number of threads, also known as the block size
|
||||
* - Multiprocessors have a fixed number of registers, and the amount of registers
|
||||
* used by each threads limits the number of threads per block.
|
||||
*/
|
||||
|
||||
/* Launch Bound Definitions */
|
||||
#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
|
||||
#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
|
||||
#define GPU_BLOCK_MAX_THREADS 1024
|
||||
#define GPU_THREAD_MAX_REGISTERS 255
|
||||
|
||||
#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
|
||||
#define GPU_KERNEL_MAX_REGISTERS 64
|
||||
|
||||
/* Compute number of threads per block and minimum blocks per multiprocessor
|
||||
* given the maximum number of registers per thread. */
|
||||
|
||||
#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
|
||||
extern "C" __global__ void __launch_bounds__(block_num_threads, \
|
||||
GPU_MULTIPRESSOR_MAX_REGISTERS / \
|
||||
(block_num_threads * thread_num_registers))
|
||||
|
||||
/* sanity checks */
|
||||
|
||||
#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
|
||||
# error "Maximum number of threads per block exceeded"
|
||||
#endif
|
||||
|
||||
#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
|
||||
GPU_MULTIPROCESSOR_MAX_BLOCKS
|
||||
# error "Maximum number of blocks per multiprocessor exceeded"
|
||||
#endif
|
||||
|
||||
#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
|
||||
# error "Maximum number of registers per thread exceeded"
|
||||
#endif
|
49
intern/cycles/kernel/device/hip/globals.h
Normal file
49
intern/cycles/kernel/device/hip/globals.h
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Constant Globals */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "kernel/kernel_profiling.h"
|
||||
#include "kernel/kernel_types.h"
|
||||
|
||||
#include "kernel/integrator/integrator_state.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Not actually used, just a NULL pointer that gets passed everywhere, which we
|
||||
* hope gets optimized out by the compiler. */
|
||||
struct KernelGlobals {
|
||||
/* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
|
||||
int unused[1];
|
||||
};
|
||||
|
||||
/* Global scene data and textures */
|
||||
__constant__ KernelData __data;
|
||||
#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
|
||||
#include "kernel/kernel_textures.h"
|
||||
|
||||
/* Integrator state */
|
||||
__constant__ IntegratorStateGPU __integrator_state;
|
||||
|
||||
/* Abstraction macros */
|
||||
#define kernel_data __data
|
||||
#define kernel_tex_fetch(t, index) t[(index)]
|
||||
#define kernel_tex_array(t) (t)
|
||||
#define kernel_integrator_state __integrator_state
|
||||
|
||||
CCL_NAMESPACE_END
|
28
intern/cycles/kernel/device/hip/kernel.cpp
Normal file
28
intern/cycles/kernel/device/hip/kernel.cpp
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright 2011-2021 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* HIP kernel entry points */
|
||||
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
|
||||
# include "kernel/device/hip/compat.h"
|
||||
# include "kernel/device/hip/config.h"
|
||||
# include "kernel/device/hip/globals.h"
|
||||
|
||||
# include "kernel/device/gpu/image.h"
|
||||
# include "kernel/device/gpu/kernel.h"
|
||||
|
||||
#endif
|
@@ -19,7 +19,7 @@
|
||||
#include "kernel/device/optix/compat.h"
|
||||
#include "kernel/device/optix/globals.h"
|
||||
|
||||
#include "kernel/device/gpu/image.h" // Texture lookup uses normal CUDA intrinsics
|
||||
#include "kernel/device/gpu/image.h" /* Texture lookup uses normal CUDA intrinsics. */
|
||||
|
||||
#include "kernel/integrator/integrator_state.h"
|
||||
#include "kernel/integrator/integrator_state_flow.h"
|
||||
@@ -44,18 +44,18 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
|
||||
template<bool always = false> ccl_device_forceinline uint get_object_id()
|
||||
{
|
||||
#ifdef __OBJECT_MOTION__
|
||||
// Always get the the instance ID from the TLAS
|
||||
// There might be a motion transform node between TLAS and BLAS which does not have one
|
||||
/* Always get the the instance ID from the TLAS.
|
||||
* There might be a motion transform node between TLAS and BLAS which does not have one. */
|
||||
uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
|
||||
#else
|
||||
uint object = optixGetInstanceId();
|
||||
#endif
|
||||
// Choose between always returning object ID or only for instances
|
||||
/* Choose between always returning object ID or only for instances. */
|
||||
if (always || (object & 1) == 0)
|
||||
// Can just remove the low bit since instance always contains object ID
|
||||
/* Can just remove the low bit since instance always contains object ID. */
|
||||
return object >> 1;
|
||||
else
|
||||
// Set to OBJECT_NONE if this is not an instanced object
|
||||
/* Set to OBJECT_NONE if this is not an instanced object. */
|
||||
return OBJECT_NONE;
|
||||
}
|
||||
|
||||
@@ -93,23 +93,30 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st
|
||||
|
||||
extern "C" __global__ void __miss__kernel_optix_miss()
|
||||
{
|
||||
// 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss
|
||||
/* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
|
||||
optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
|
||||
optixSetPayload_5(PRIMITIVE_NONE);
|
||||
}
|
||||
|
||||
extern "C" __global__ void __anyhit__kernel_optix_local_hit()
|
||||
{
|
||||
#ifdef __HAIR__
|
||||
if (!optixIsTriangleHit()) {
|
||||
/* Ignore curves. */
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __BVH_LOCAL__
|
||||
const uint object = get_object_id<true>();
|
||||
if (object != optixGetPayload_4() /* local_object */) {
|
||||
// Only intersect with matching object
|
||||
/* Only intersect with matching object. */
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
|
||||
const uint max_hits = optixGetPayload_5();
|
||||
if (max_hits == 0) {
|
||||
// Special case for when no hit information is requested, just report that something was hit
|
||||
/* Special case for when no hit information is requested, just report that something was hit */
|
||||
optixSetPayload_5(true);
|
||||
return optixTerminateRay();
|
||||
}
|
||||
@@ -136,8 +143,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
|
||||
}
|
||||
else {
|
||||
if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
|
||||
// Record closest intersection only
|
||||
// Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
|
||||
/* Record closest intersection only.
|
||||
* Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
|
||||
*/
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
|
||||
@@ -154,14 +162,14 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
|
||||
isect->u = 1.0f - barycentrics.y - barycentrics.x;
|
||||
isect->v = barycentrics.x;
|
||||
|
||||
// Record geometric normal
|
||||
/* Record geometric normal. */
|
||||
const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
|
||||
const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0));
|
||||
const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1));
|
||||
const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2));
|
||||
local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));
|
||||
|
||||
// Continue tracing (without this the trace call would return after the first hit)
|
||||
/* Continue tracing (without this the trace call would return after the first hit). */
|
||||
optixIgnoreIntersection();
|
||||
#endif
|
||||
}
|
||||
@@ -190,7 +198,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
|
||||
u = __uint_as_float(optixGetAttribute_0());
|
||||
v = __uint_as_float(optixGetAttribute_1());
|
||||
|
||||
// Filter out curve endcaps
|
||||
/* Filter out curve endcaps. */
|
||||
if (u == 0.0f || u == 1.0f) {
|
||||
ignore_intersection = true;
|
||||
}
|
||||
@@ -241,10 +249,10 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
|
||||
isect->type = kernel_tex_fetch(__prim_type, prim);
|
||||
|
||||
# ifdef __TRANSPARENT_SHADOWS__
|
||||
// Detect if this surface has a shader with transparent shadows
|
||||
/* Detect if this surface has a shader with transparent shadows. */
|
||||
if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
|
||||
# endif
|
||||
// If no transparent shadows, all light is blocked and we can stop immediately
|
||||
/* If no transparent shadows, all light is blocked and we can stop immediately. */
|
||||
optixSetPayload_5(true);
|
||||
return optixTerminateRay();
|
||||
# ifdef __TRANSPARENT_SHADOWS__
|
||||
@@ -252,24 +260,39 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
|
||||
# endif
|
||||
}
|
||||
|
||||
// Continue tracing
|
||||
/* Continue tracing. */
|
||||
optixIgnoreIntersection();
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
|
||||
extern "C" __global__ void __anyhit__kernel_optix_volume_test()
|
||||
{
|
||||
uint visibility = optixGetPayload_4();
|
||||
#ifdef __HAIR__
|
||||
if (!optixIsTriangleHit()) {
|
||||
/* Ignore curves. */
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
const uint prim = optixGetPrimitiveIndex();
|
||||
const uint visibility = optixGetPayload_4();
|
||||
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
#endif
|
||||
|
||||
const uint object = get_object_id<true>();
|
||||
if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
|
||||
{
|
||||
#ifdef __HAIR__
|
||||
if (!optixIsTriangleHit()) {
|
||||
// Filter out curve endcaps
|
||||
/* Filter out curve endcaps. */
|
||||
const float u = __uint_as_float(optixGetAttribute_0());
|
||||
if (u == 0.0f || u == 1.0f) {
|
||||
return optixIgnoreIntersection();
|
||||
@@ -277,18 +300,26 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
|
||||
}
|
||||
#endif
|
||||
|
||||
// Shadow ray early termination
|
||||
#ifdef __VISIBILITY_FLAG__
|
||||
const uint prim = optixGetPrimitiveIndex();
|
||||
const uint visibility = optixGetPayload_4();
|
||||
if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
|
||||
return optixIgnoreIntersection();
|
||||
}
|
||||
|
||||
/* Shadow ray early termination. */
|
||||
if (visibility & PATH_RAY_SHADOW_OPAQUE) {
|
||||
return optixTerminateRay();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" __global__ void __closesthit__kernel_optix_hit()
|
||||
{
|
||||
optixSetPayload_0(__float_as_uint(optixGetRayTmax())); // Intersection distance
|
||||
optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
|
||||
optixSetPayload_3(optixGetPrimitiveIndex());
|
||||
optixSetPayload_4(get_object_id());
|
||||
// Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index
|
||||
/* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. */
|
||||
optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex()));
|
||||
|
||||
if (optixIsTriangleHit()) {
|
||||
@@ -297,7 +328,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
|
||||
optixSetPayload_2(__float_as_uint(barycentrics.x));
|
||||
}
|
||||
else {
|
||||
optixSetPayload_1(optixGetAttribute_0()); // Same as 'optixGetCurveParameter()'
|
||||
optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
|
||||
optixSetPayload_2(optixGetAttribute_1());
|
||||
}
|
||||
}
|
||||
@@ -311,7 +342,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
|
||||
float3 P = optixGetObjectRayOrigin();
|
||||
float3 dir = optixGetObjectRayDirection();
|
||||
|
||||
// The direction is not normalized by default, but the curve intersection routine expects that
|
||||
/* The direction is not normalized by default, but the curve intersection routine expects that */
|
||||
float len;
|
||||
dir = normalize_len(dir, &len);
|
||||
|
||||
@@ -323,15 +354,15 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
|
||||
|
||||
Intersection isect;
|
||||
isect.t = optixGetRayTmax();
|
||||
// Transform maximum distance into object space
|
||||
/* Transform maximum distance into object space. */
|
||||
if (isect.t != FLT_MAX)
|
||||
isect.t *= len;
|
||||
|
||||
if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
|
||||
optixReportIntersection(isect.t / len,
|
||||
type & PRIMITIVE_ALL,
|
||||
__float_as_int(isect.u), // Attribute_0
|
||||
__float_as_int(isect.v)); // Attribute_1
|
||||
__float_as_int(isect.u), /* Attribute_0 */
|
||||
__float_as_int(isect.v)); /* Attribute_1 */
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -713,7 +713,7 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
|
||||
|
||||
P = transform_point(&tfm, P);
|
||||
D = transform_direction(&tfm, D * t);
|
||||
D = normalize_len(D, &t);
|
||||
D = safe_normalize_len(D, &t);
|
||||
}
|
||||
|
||||
int prim = kernel_tex_fetch(__prim_index, isect_prim);
|
||||
@@ -764,8 +764,10 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
|
||||
/* Thick curves, compute normal using direction from inside the curve.
|
||||
* This could be optimized by recording the normal in the intersection,
|
||||
* however for Optix this would go beyond the size of the payload. */
|
||||
/* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small
|
||||
* radius). In this case use the view direction to approximate the normal. */
|
||||
const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
|
||||
const float3 Ng = normalize(P - P_inside);
|
||||
const float3 Ng = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;
|
||||
|
||||
sd->N = Ng;
|
||||
sd->Ng = Ng;
|
||||
|
@@ -41,7 +41,18 @@ ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
|
||||
uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
|
||||
|
||||
while (attr_map.x != id) {
|
||||
if (UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
|
||||
if (UNLIKELY(attr_map.y == 0)) {
|
||||
return (int)ATTR_STD_NOT_FOUND;
|
||||
}
|
||||
else {
|
||||
/* Chain jump to a different part of the table. */
|
||||
attr_offset = attr_map.z;
|
||||
}
|
||||
}
|
||||
else {
|
||||
attr_offset += ATTR_PRIM_TYPES;
|
||||
}
|
||||
attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
|
||||
}
|
||||
|
||||
|
@@ -365,9 +365,8 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
|
||||
#ifdef __VOLUME__
|
||||
if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
|
||||
#endif
|
||||
|
||||
{
|
||||
const int path_flag = INTEGRATOR_STATE(path, flag);
|
||||
|
||||
#ifdef __SUBSURFACE__
|
||||
/* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
|
||||
if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
|
||||
@@ -375,9 +374,7 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
|
||||
{
|
||||
/* Evaluate shader. */
|
||||
PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
|
||||
shader_eval_surface<node_feature_mask>(
|
||||
INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
|
||||
}
|
||||
shader_eval_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
|
||||
}
|
||||
|
||||
#ifdef __SUBSURFACE__
|
||||
@@ -417,8 +414,10 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
|
||||
|
||||
/* Perform path termination. Most paths have already been terminated in
|
||||
* the intersect_closest kernel, this is just for emission and for dividing
|
||||
* throughput by the probability at the right moment. */
|
||||
const int path_flag = INTEGRATOR_STATE(path, flag);
|
||||
* throughput by the probability at the right moment.
|
||||
*
|
||||
* Also ensure we don't do it twice for SSS at both the entry and exit point. */
|
||||
if (!(path_flag & PATH_RAY_SUBSURFACE)) {
|
||||
const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
|
||||
0.0f :
|
||||
path_state_continuation_probability(INTEGRATOR_STATE_PASS,
|
||||
@@ -429,6 +428,7 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
|
||||
else if (probability != 1.0f) {
|
||||
INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __DENOISING_FEATURES__
|
||||
kernel_write_denoising_features_surface(INTEGRATOR_STATE_PASS, &sd, render_buffer);
|
||||
|
@@ -74,7 +74,7 @@ ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
|
||||
ShaderData *ccl_restrict sd,
|
||||
float3 *ccl_restrict extinction)
|
||||
{
|
||||
shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
|
||||
shader_eval_volume<true>(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
|
||||
return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
|
||||
});
|
||||
|
||||
@@ -93,7 +93,7 @@ ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
|
||||
VolumeShaderCoefficients *coeff)
|
||||
{
|
||||
const int path_flag = INTEGRATOR_STATE(path, flag);
|
||||
shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
|
||||
shader_eval_volume<false>(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
|
||||
return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
|
||||
});
|
||||
|
||||
|
@@ -60,7 +60,15 @@ CCL_NAMESPACE_BEGIN
|
||||
* TODO: these could be made dynamic depending on the features used in the scene. */
|
||||
|
||||
#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
|
||||
#define INTEGRATOR_SHADOW_ISECT_SIZE 4
|
||||
|
||||
#define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024
|
||||
#define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4
|
||||
|
||||
#ifdef __KERNEL_CPU__
|
||||
# define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
|
||||
#else
|
||||
# define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_GPU
|
||||
#endif
|
||||
|
||||
/* Data structures */
|
||||
|
||||
@@ -74,9 +82,9 @@ typedef struct IntegratorStateCPU {
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
name;
|
||||
#define KERNEL_STRUCT_END_ARRAY(name, size) \
|
||||
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
|
||||
} \
|
||||
name[size];
|
||||
name[cpu_size];
|
||||
#include "kernel/integrator/integrator_state_template.h"
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
@@ -103,9 +111,9 @@ typedef struct IntegratorStateGPU {
|
||||
#define KERNEL_STRUCT_END(name) \
|
||||
} \
|
||||
name;
|
||||
#define KERNEL_STRUCT_END_ARRAY(name, size) \
|
||||
#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
|
||||
} \
|
||||
name[size];
|
||||
name[gpu_size];
|
||||
#include "kernel/integrator/integrator_state_template.h"
|
||||
#undef KERNEL_STRUCT_BEGIN
|
||||
#undef KERNEL_STRUCT_MEMBER
|
||||
|
@@ -107,7 +107,7 @@ KERNEL_STRUCT_END(subsurface)
|
||||
KERNEL_STRUCT_BEGIN(volume_stack)
|
||||
KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
|
||||
KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
|
||||
KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
|
||||
KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE)
|
||||
|
||||
/********************************* Shadow Path State **************************/
|
||||
|
||||
@@ -153,11 +153,15 @@ KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACIN
|
||||
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
|
||||
/* TODO: exclude for GPU. */
|
||||
KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
|
||||
KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
|
||||
KERNEL_STRUCT_END_ARRAY(shadow_isect,
|
||||
INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
|
||||
INTEGRATOR_SHADOW_ISECT_SIZE_GPU)
|
||||
|
||||
/**************************** Shadow Volume Stack *****************************/
|
||||
|
||||
KERNEL_STRUCT_BEGIN(shadow_volume_stack)
|
||||
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
|
||||
KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
|
||||
KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
|
||||
KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
|
||||
INTEGRATOR_VOLUME_STACK_SIZE,
|
||||
INTEGRATOR_VOLUME_STACK_SIZE)
|
||||
|
@@ -217,10 +217,10 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
|
||||
while (false) \
|
||||
;
|
||||
|
||||
# define KERNEL_STRUCT_END_ARRAY(name, array_size) \
|
||||
# define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
|
||||
++index; \
|
||||
} \
|
||||
while (index < array_size) \
|
||||
while (index < gpu_array_size) \
|
||||
;
|
||||
|
||||
# include "kernel/integrator/integrator_state_template.h"
|
||||
@@ -264,7 +264,12 @@ ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_AR
|
||||
|
||||
IntegratorStateCPU *ccl_restrict split_state = state + 1;
|
||||
|
||||
*split_state = *state;
|
||||
/* Only copy the required subset, since shadow intersections are big and irrelevant here. */
|
||||
split_state->path = state->path;
|
||||
split_state->ray = state->ray;
|
||||
split_state->isect = state->isect;
|
||||
memcpy(split_state->volume_stack, state->volume_stack, sizeof(state->volume_stack));
|
||||
split_state->shadow_path = state->shadow_path;
|
||||
|
||||
split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
|
||||
#endif
|
||||
|
@@ -386,7 +386,7 @@ ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
|
||||
{
|
||||
/* The throughput for shadow paths already contains the light shader evaluation. */
|
||||
float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
|
||||
kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
|
||||
kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce));
|
||||
|
||||
ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
|
||||
render_buffer);
|
||||
|
@@ -42,6 +42,16 @@ ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,
|
||||
|
||||
object_inverse_dir_transform(kg, &sd, &D);
|
||||
|
||||
#ifdef __KERNEL_DEBUG_NAN__
|
||||
if (!isfinite3_safe(D)) {
|
||||
kernel_assert(!"Cycles displacement with non-finite value detected");
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Ensure finite displacement, preventing BVH from becoming degenerate and avoiding possible
|
||||
* traversal issues caused by non-finite math. */
|
||||
D = ensure_finite3(D);
|
||||
|
||||
/* Write output. */
|
||||
output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
|
||||
}
|
||||
@@ -66,7 +76,16 @@ ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
|
||||
const int path_flag = PATH_RAY_EMISSION;
|
||||
shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
|
||||
INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
|
||||
const float3 color = shader_background_eval(&sd);
|
||||
float3 color = shader_background_eval(&sd);
|
||||
|
||||
#ifdef __KERNEL_DEBUG_NAN__
|
||||
if (!isfinite3_safe(color)) {
|
||||
kernel_assert(!"Cycles background with non-finite value detected");
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Ensure finite color, avoiding possible numerical instabilities in the path tracing kernels. */
|
||||
color = ensure_finite3(color);
|
||||
|
||||
/* Write output. */
|
||||
output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
|
||||
|
@@ -394,7 +394,7 @@ film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_conver
|
||||
|
||||
/* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
|
||||
* shadow catcher objects in the scene. In this case there will be no auxiliary passes required
|
||||
* for the devision (to save up memory). So delay the asserts to this point so that the number of
|
||||
* for the decision (to save up memory). So delay the asserts to this point so that the number of
|
||||
* samples check handles such configuration. */
|
||||
kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
|
||||
kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
|
||||
|
@@ -74,10 +74,6 @@ ccl_device_inline float cmj_randfloat_simple(uint i, uint p)
|
||||
|
||||
ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
|
||||
{
|
||||
/* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
|
||||
* the x part is used as the sample (TODO(@leesonw): Add using both x and y parts
|
||||
* independently). */
|
||||
|
||||
/* Perform Owen shuffle of the sample number to reorder the samples. */
|
||||
#ifdef _SIMPLE_HASH_
|
||||
const uint rv = cmj_hash_simple(dimension, rng_hash);
|
||||
@@ -95,7 +91,10 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha
|
||||
const uint sample_set = s / NUM_PMJ_SAMPLES;
|
||||
const uint d = (dimension + sample_set);
|
||||
const uint dim = d % NUM_PMJ_PATTERNS;
|
||||
int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
|
||||
|
||||
/* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
|
||||
* the x part is used for even dims and the y for odd. */
|
||||
int index = 2 * ((dim >> 1) * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)) + (dim & 1);
|
||||
|
||||
float fx = kernel_tex_fetch(__sample_pattern_lut, index);
|
||||
|
||||
@@ -104,12 +103,11 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha
|
||||
# ifdef _SIMPLE_HASH_
|
||||
float dx = cmj_randfloat_simple(d, rng_hash);
|
||||
# else
|
||||
/* Only jitter within the grid interval. */
|
||||
float dx = cmj_randfloat(d, rng_hash);
|
||||
# endif
|
||||
fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
|
||||
/* Jitter sample locations and map back into [0 1]. */
|
||||
fx = fx + dx;
|
||||
fx = fx - floorf(fx);
|
||||
|
||||
#else
|
||||
# warning "Not using Cranley-Patterson Rotation."
|
||||
#endif
|
||||
@@ -136,7 +134,7 @@ ccl_device void pmj_sample_2D(
|
||||
/* Based on the sample number a sample pattern is selected and offset by the dimension. */
|
||||
const uint sample_set = s / NUM_PMJ_SAMPLES;
|
||||
const uint d = (dimension + sample_set);
|
||||
const uint dim = d % NUM_PMJ_PATTERNS;
|
||||
uint dim = d % NUM_PMJ_PATTERNS;
|
||||
int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
|
||||
|
||||
float fx = kernel_tex_fetch(__sample_pattern_lut, index);
|
||||
@@ -151,17 +149,17 @@ ccl_device void pmj_sample_2D(
|
||||
float dx = cmj_randfloat(d, rng_hash);
|
||||
float dy = cmj_randfloat(d + 1, rng_hash);
|
||||
# endif
|
||||
/* Only jitter within the grid cells. */
|
||||
fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
|
||||
fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
|
||||
fx = fx - floorf(fx);
|
||||
fy = fy - floorf(fy);
|
||||
/* Jitter sample locations and map back to the unit square [0 1]x[0 1]. */
|
||||
float sx = fx + dx;
|
||||
float sy = fy + dy;
|
||||
sx = sx - floorf(sx);
|
||||
sy = sy - floorf(sy);
|
||||
#else
|
||||
# warning "Not using Cranley Patterson Rotation."
|
||||
#endif
|
||||
|
||||
(*x) = fx;
|
||||
(*y) = fy;
|
||||
(*x) = sx;
|
||||
(*y) = sy;
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
@@ -750,7 +750,7 @@ ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,
|
||||
|
||||
/* Volume Evaluation */
|
||||
|
||||
template<typename StackReadOp>
|
||||
template<const bool shadow, typename StackReadOp>
|
||||
ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
|
||||
ShaderData *ccl_restrict sd,
|
||||
const int path_flag,
|
||||
@@ -815,10 +815,13 @@ ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
|
||||
# endif
|
||||
|
||||
/* Merge closures to avoid exceeding number of closures limit. */
|
||||
if (i > 0)
|
||||
if (!shadow) {
|
||||
if (i > 0) {
|
||||
shader_merge_volume_closures(sd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __VOLUME__ */
|
||||
|
||||
|
@@ -572,6 +572,7 @@ typedef enum AttributeStandard {
|
||||
ATTR_STD_MOTION_VERTEX_NORMAL,
|
||||
ATTR_STD_PARTICLE,
|
||||
ATTR_STD_CURVE_INTERCEPT,
|
||||
ATTR_STD_CURVE_LENGTH,
|
||||
ATTR_STD_CURVE_RANDOM,
|
||||
ATTR_STD_PTEX_FACE_ID,
|
||||
ATTR_STD_PTEX_UV,
|
||||
|
@@ -107,6 +107,7 @@ ustring OSLRenderServices::u_geom_undisplaced("geom:undisplaced");
|
||||
ustring OSLRenderServices::u_is_smooth("geom:is_smooth");
|
||||
ustring OSLRenderServices::u_is_curve("geom:is_curve");
|
||||
ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness");
|
||||
ustring OSLRenderServices::u_curve_length("geom:curve_length");
|
||||
ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
|
||||
ustring OSLRenderServices::u_curve_random("geom:curve_random");
|
||||
ustring OSLRenderServices::u_path_ray_length("path:ray_length");
|
||||
|
@@ -294,6 +294,7 @@ class OSLRenderServices : public OSL::RendererServices {
|
||||
static ustring u_is_smooth;
|
||||
static ustring u_is_curve;
|
||||
static ustring u_curve_thickness;
|
||||
static ustring u_curve_length;
|
||||
static ustring u_curve_tangent_normal;
|
||||
static ustring u_curve_random;
|
||||
static ustring u_path_ray_length;
|
||||
|
@@ -18,12 +18,14 @@
|
||||
|
||||
shader node_hair_info(output float IsStrand = 0.0,
|
||||
output float Intercept = 0.0,
|
||||
output float Length = 0.0,
|
||||
output float Thickness = 0.0,
|
||||
output normal TangentNormal = N,
|
||||
output float Random = 0)
|
||||
{
|
||||
getattribute("geom:is_curve", IsStrand);
|
||||
getattribute("geom:curve_intercept", Intercept);
|
||||
getattribute("geom:curve_length", Length);
|
||||
getattribute("geom:curve_thickness", Thickness);
|
||||
getattribute("geom:curve_tangent_normal", TangentNormal);
|
||||
getattribute("geom:curve_random", Random);
|
||||
|
@@ -213,6 +213,8 @@ ccl_device_noinline void svm_node_hair_info(
|
||||
}
|
||||
case NODE_INFO_CURVE_INTERCEPT:
|
||||
break; /* handled as attribute */
|
||||
case NODE_INFO_CURVE_LENGTH:
|
||||
break; /* handled as attribute */
|
||||
case NODE_INFO_CURVE_RANDOM:
|
||||
break; /* handled as attribute */
|
||||
case NODE_INFO_CURVE_THICKNESS: {
|
||||
|
@@ -173,6 +173,7 @@ typedef enum NodeParticleInfo {
|
||||
typedef enum NodeHairInfo {
|
||||
NODE_INFO_CURVE_IS_STRAND,
|
||||
NODE_INFO_CURVE_INTERCEPT,
|
||||
NODE_INFO_CURVE_LENGTH,
|
||||
NODE_INFO_CURVE_THICKNESS,
|
||||
/* Fade for minimum hair width transiency. */
|
||||
// NODE_INFO_CURVE_FADE,
|
||||
|
@@ -342,6 +342,8 @@ const char *Attribute::standard_name(AttributeStandard std)
|
||||
return "particle";
|
||||
case ATTR_STD_CURVE_INTERCEPT:
|
||||
return "curve_intercept";
|
||||
case ATTR_STD_CURVE_LENGTH:
|
||||
return "curve_length";
|
||||
case ATTR_STD_CURVE_RANDOM:
|
||||
return "curve_random";
|
||||
case ATTR_STD_PTEX_FACE_ID:
|
||||
@@ -586,6 +588,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
|
||||
case ATTR_STD_CURVE_INTERCEPT:
|
||||
attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
|
||||
break;
|
||||
case ATTR_STD_CURVE_LENGTH:
|
||||
attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
|
||||
break;
|
||||
case ATTR_STD_CURVE_RANDOM:
|
||||
attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
|
||||
break;
|
||||
|
@@ -22,7 +22,6 @@
|
||||
#include "util/util_foreach.h"
|
||||
#include "util/util_hash.h"
|
||||
#include "util/util_math.h"
|
||||
#include "util/util_opengl.h"
|
||||
#include "util/util_time.h"
|
||||
#include "util/util_types.h"
|
||||
|
||||
|
@@ -434,7 +434,8 @@ void Film::update_passes(Scene *scene, bool add_sample_count_pass)
|
||||
const ObjectManager *object_manager = scene->object_manager;
|
||||
Integrator *integrator = scene->integrator;
|
||||
|
||||
if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) {
|
||||
if (!is_modified() && !object_manager->need_update() && !integrator->is_modified() &&
|
||||
!background->is_modified()) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@@ -794,11 +794,6 @@ void GeometryManager::device_update_attributes(Device *device,
|
||||
foreach (AttributeRequest &req, attributes.requests) {
|
||||
Attribute *attr = geom->attributes.find(req);
|
||||
|
||||
/* Vertex normals are stored in DeviceScene.tri_vnormal. */
|
||||
if (attr && attr->std == ATTR_STD_VERTEX_NORMAL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
update_attribute_element_size(geom,
|
||||
attr,
|
||||
ATTR_PRIM_GEOMETRY,
|
||||
@@ -811,11 +806,6 @@ void GeometryManager::device_update_attributes(Device *device,
|
||||
Mesh *mesh = static_cast<Mesh *>(geom);
|
||||
Attribute *subd_attr = mesh->subd_attributes.find(req);
|
||||
|
||||
/* Vertex normals are stored in DeviceScene.tri_vnormal. */
|
||||
if (subd_attr && subd_attr->std == ATTR_STD_VERTEX_NORMAL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
update_attribute_element_size(mesh,
|
||||
subd_attr,
|
||||
ATTR_PRIM_SUBD,
|
||||
@@ -870,11 +860,6 @@ void GeometryManager::device_update_attributes(Device *device,
|
||||
Attribute *attr = geom->attributes.find(req);
|
||||
|
||||
if (attr) {
|
||||
/* Vertex normals are stored in DeviceScene.tri_vnormal. */
|
||||
if (attr->std == ATTR_STD_VERTEX_NORMAL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* force a copy if we need to reallocate all the data */
|
||||
attr->modified |= attributes_need_realloc[Attribute::kernel_type(*attr)];
|
||||
}
|
||||
@@ -898,11 +883,6 @@ void GeometryManager::device_update_attributes(Device *device,
|
||||
Attribute *subd_attr = mesh->subd_attributes.find(req);
|
||||
|
||||
if (subd_attr) {
|
||||
/* Vertex normals are stored in DeviceScene.tri_vnormal. */
|
||||
if (subd_attr->std == ATTR_STD_VERTEX_NORMAL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* force a copy if we need to reallocate all the data */
|
||||
subd_attr->modified |= attributes_need_realloc[Attribute::kernel_type(*subd_attr)];
|
||||
}
|
||||
|
@@ -46,7 +46,7 @@ class GPUDisplayParams {
|
||||
* NOTE: Is not affected by the resolution divider. */
|
||||
int2 full_size = make_int2(0, 0);
|
||||
|
||||
/* Effective vieport size.
|
||||
/* Effective viewport size.
|
||||
* In the case of border render, size of the border rectangle.
|
||||
*
|
||||
* NOTE: Is not affected by the resolution divider. */
|
||||
@@ -163,7 +163,7 @@ class GPUDisplay {
|
||||
* This call might happen in parallel with draw, but can never happen in parallel with the
|
||||
* update.
|
||||
*
|
||||
* The actual zero-ing can be deferred to a later moment. What is important is that after clear
|
||||
* The actual zeroing can be deferred to a later moment. What is important is that after clear
|
||||
* and before pixels update the drawing texture will be fully empty, and that partial update
|
||||
* after clear will write new pixel values for an updating area, leaving everything else zeroed.
|
||||
*
|
||||
|
@@ -19,7 +19,7 @@
|
||||
|
||||
#include "kernel/kernel_types.h"
|
||||
|
||||
#include "device/device_denoise.h" /* For the paramaters and type enum. */
|
||||
#include "device/device_denoise.h" /* For the parameters and type enum. */
|
||||
#include "graph/node.h"
|
||||
#include "integrator/adaptive_sampling.h"
|
||||
|
||||
|
@@ -4368,6 +4368,7 @@ NODE_DEFINE(HairInfoNode)
|
||||
|
||||
SOCKET_OUT_FLOAT(is_strand, "Is Strand");
|
||||
SOCKET_OUT_FLOAT(intercept, "Intercept");
|
||||
SOCKET_OUT_FLOAT(size, "Length");
|
||||
SOCKET_OUT_FLOAT(thickness, "Thickness");
|
||||
SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal");
|
||||
#if 0 /* Output for minimum hair width transparency - deactivated. */
|
||||
@@ -4390,6 +4391,9 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
|
||||
if (!intercept_out->links.empty())
|
||||
attributes->add(ATTR_STD_CURVE_INTERCEPT);
|
||||
|
||||
if (!output("Length")->links.empty())
|
||||
attributes->add(ATTR_STD_CURVE_LENGTH);
|
||||
|
||||
if (!output("Random")->links.empty())
|
||||
attributes->add(ATTR_STD_CURVE_RANDOM);
|
||||
}
|
||||
@@ -4412,6 +4416,12 @@ void HairInfoNode::compile(SVMCompiler &compiler)
|
||||
compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
|
||||
}
|
||||
|
||||
out = output("Length");
|
||||
if (!out->links.empty()) {
|
||||
int attr = compiler.attribute(ATTR_STD_CURVE_LENGTH);
|
||||
compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
|
||||
}
|
||||
|
||||
out = output("Thickness");
|
||||
if (!out->links.empty()) {
|
||||
compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_THICKNESS, compiler.stack_assign(out));
|
||||
@@ -6749,8 +6759,6 @@ void NormalMapNode::attributes(Shader *shader, AttributeRequestSet *attributes)
|
||||
attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str()));
|
||||
attributes->add(ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
|
||||
}
|
||||
|
||||
attributes->add(ATTR_STD_VERTEX_NORMAL);
|
||||
}
|
||||
|
||||
ShaderNode::attributes(shader, attributes);
|
||||
@@ -7026,8 +7034,6 @@ void VectorDisplacementNode::attributes(Shader *shader, AttributeRequestSet *att
|
||||
attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str()));
|
||||
attributes->add(ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
|
||||
}
|
||||
|
||||
attributes->add(ATTR_STD_VERTEX_NORMAL);
|
||||
}
|
||||
|
||||
ShaderNode::attributes(shader, attributes);
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user