move node_common to c++

Fix: Curve to Mesh node outputs original curve
It should only output the new mesh (and potentially instances that are processed separately).
2021-09-30 13:52:12 +02:00 · 2021-09-30 13:33:21 +02:00 · 2021-09-30 12:51:09 +02:00 · 2021-09-30 12:33:34 +02:00 · 2021-09-30 11:44:53 +02:00 · 2021-09-30 17:45:09 +10:00
817 changed files with 25385 additions and 6144 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -419,6 +419,8 @@ mark_as_advanced(WITH_CYCLES_NATIVE_ONLY)

 option(WITH_CYCLES_DEVICE_CUDA              "Enable Cycles CUDA compute support" ON)
 option(WITH_CYCLES_DEVICE_OPTIX             "Enable Cycles OptiX support" ON)
+option(WITH_CYCLES_DEVICE_HIP               "Enable Cycles HIP support" OFF)
+mark_as_advanced(WITH_CYCLES_DEVICE_HIP)
 mark_as_advanced(WITH_CYCLES_DEVICE_CUDA)

 option(WITH_CUDA_DYNLOAD "Dynamically load CUDA libraries at runtime" ON)
@@ -821,6 +823,11 @@ if(NOT WITH_CUDA_DYNLOAD)
  endif()
 endif()

+if(WITH_CYCLES_DEVICE_HIP)
+  # Currently HIP must be dynamically loaded, this may change in future toolkits
+  set(WITH_HIP_DYNLOAD ON)
+endif()
+
 #-----------------------------------------------------------------------------
 # Check check if submodules are cloned

@@ -1850,6 +1857,9 @@ elseif(WITH_CYCLES_STANDALONE)
  if(WITH_CUDA_DYNLOAD)
    add_subdirectory(extern/cuew)
  endif()
+  if(WITH_HIP_DYNLOAD)
+    add_subdirectory(extern/hipew)
+  endif()
  if(NOT WITH_SYSTEM_GLEW)
    add_subdirectory(extern/glew)
  endif()
--- a/build_files/build_environment/patches/ffmpeg.diff
+++ b/build_files/build_environment/patches/ffmpeg.diff
@@ -70,16 +70,18 @@
 }
 --- a/libavcodec/rl.c
 +++ b/libavcodec/rl.c
-@@ -71,7 +71,7 @@ av_cold void ff_rl_init(RLTable *rl,
+@@ -71,17 +71,19 @@
 av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
 {
     int i, q;
 -    VLC_TYPE table[1500][2] = {{0}};
 +    VLC_TYPE (*table)[2] = av_calloc(sizeof(VLC_TYPE), 1500 * 2);
     VLC vlc = { .table = table, .table_allocated = static_size };
-     av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+-    av_assert0(static_size <= FF_ARRAY_ELEMS(table));
+    av_assert0(static_size < 1500);
     init_vlc(&vlc, 9, rl->n + 1, &rl->table_vlc[0][1], 4, 2, &rl->table_vlc[0][0], 4, 2, INIT_VLC_USE_NEW_STATIC);
-@@ -80,8 +80,10 @@ av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
+ 
+     for (q = 0; q < 32; q++) {
         int qmul = q * 2;
         int qadd = (q - 1) | 1;
 
@@ -91,7 +93,7 @@
 
         if (q == 0) {
             qmul = 1;
-@@ -113,4 +115,5 @@ av_cold void ff_rl_init_vlc(RLTable *rl, unsigned static_size)
+@@ -113,4 +115,5 @@
             rl->rl_vlc[q][i].run   = run;
         }
     }
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@@ -67,9 +67,12 @@ endif()

 if(WITH_CYCLES OR WITH_COMPOSITOR OR WITH_OPENSUBDIV)
  add_subdirectory(clew)
-  if(WITH_CUDA_DYNLOAD)
+  if((WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX) AND WITH_CUDA_DYNLOAD)
    add_subdirectory(cuew)
  endif()
+  if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
+    add_subdirectory(hipew)
+  endif()
 endif()

 if(WITH_GHOST_X11 AND WITH_GHOST_XDND)
--- a/extern/audaspace/bindings/C/AUD_Sound.cpp
+++ b/extern/audaspace/bindings/C/AUD_Sound.cpp
@@ -102,26 +102,30 @@ AUD_API int AUD_Sound_getFileStreams(AUD_Sound* sound, AUD_StreamInfo **stream_i

 	if(file)
 	{
-		auto streams = file->queryStreams();
-
-		size_t size = sizeof(AUD_StreamInfo) * streams.size();
-
-		if(!size)
+		try
+		{
+			auto streams = file->queryStreams();
+
+			size_t size = sizeof(AUD_StreamInfo) * streams.size();
+
+			if(!size)
+			{
+				*stream_infos = nullptr;
+				return 0;
+			}
+
+			*stream_infos = reinterpret_cast<AUD_StreamInfo*>(std::malloc(size));
+			std::memcpy(*stream_infos, streams.data(), size);
+
+			return streams.size();
+		}
+		catch(Exception&)
 		{
-			*stream_infos = nullptr;
-			return 0;
 		}
-
-		*stream_infos = reinterpret_cast<AUD_StreamInfo*>(std::malloc(size));
-		std::memcpy(*stream_infos, streams.data(), size);
-
-		return streams.size();
-	}
-	else
-	{
-		*stream_infos = nullptr;
-		return 0;
 	}
+
+	*stream_infos = nullptr;
+	return 0;
 }

 AUD_API sample_t* AUD_Sound_data(AUD_Sound* sound, int* length, AUD_Specs* specs)
--- a/extern/hipew/CMakeLists.txt
+++ b/extern/hipew/CMakeLists.txt
@@ -0,0 +1,39 @@
+# ***** BEGIN GPL LICENSE BLOCK *****
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+#
+# The Original Code is Copyright (C) 2021, Blender Foundation
+# All rights reserved.
+# ***** END GPL LICENSE BLOCK *****
+
+set(INC
+  .
+  include
+)
+
+set(INC_SYS
+
+)
+
+set(SRC
+  src/hipew.c
+
+  include/hipew.h
+)
+
+set(LIB
+)
+
+blender_add_lib(extern_hipew "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
--- a/extern/hipew/include/hipew.h
+++ b/extern/hipew/include/hipew.h
--- a/extern/hipew/src/hipew.c
+++ b/extern/hipew/src/hipew.c
@@ -0,0 +1,533 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License
+ */
+#ifdef _MSC_VER
+#  if _MSC_VER < 1900
+#    define snprintf _snprintf
+#  endif
+#  define popen _popen
+#  define pclose _pclose
+#  define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <hipew.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#ifdef _WIN32
+#  define WIN32_LEAN_AND_MEAN
+#  define VC_EXTRALEAN
+#  include <windows.h>
+
+/* Utility macros. */
+
+typedef HMODULE DynamicLibrary;
+
+#  define dynamic_library_open(path)         LoadLibraryA(path)
+#  define dynamic_library_close(lib)         FreeLibrary(lib)
+#  define dynamic_library_find(lib, symbol)  GetProcAddress(lib, symbol)
+#else
+#  include <dlfcn.h>
+
+typedef void* DynamicLibrary;
+
+#  define dynamic_library_open(path)         dlopen(path, RTLD_NOW)
+#  define dynamic_library_close(lib)         dlclose(lib)
+#  define dynamic_library_find(lib, symbol)  dlsym(lib, symbol)
+#endif
+
+#define _LIBRARY_FIND_CHECKED(lib, name) \
+        name = (t##name *)dynamic_library_find(lib, #name); \
+        assert(name);
+
+#define _LIBRARY_FIND(lib, name) \
+        name = (t##name *)dynamic_library_find(lib, #name);
+
+#define HIP_LIBRARY_FIND_CHECKED(name) \
+        _LIBRARY_FIND_CHECKED(hip_lib, name)
+#define HIP_LIBRARY_FIND(name) _LIBRARY_FIND(hip_lib, name)
+
+
+static DynamicLibrary hip_lib;
+
+/* Function definitions. */
+thipGetErrorName *hipGetErrorName;
+thipInit *hipInit;
+thipDriverGetVersion *hipDriverGetVersion;
+thipGetDevice *hipGetDevice;
+thipGetDeviceCount *hipGetDeviceCount;
+thipDeviceGetName *hipDeviceGetName;
+thipDeviceGetAttribute *hipDeviceGetAttribute;
+thipDeviceComputeCapability *hipDeviceComputeCapability;
+thipDevicePrimaryCtxRetain *hipDevicePrimaryCtxRetain;
+thipDevicePrimaryCtxRelease *hipDevicePrimaryCtxRelease;
+thipDevicePrimaryCtxSetFlags *hipDevicePrimaryCtxSetFlags;
+thipDevicePrimaryCtxGetState *hipDevicePrimaryCtxGetState;
+thipDevicePrimaryCtxReset *hipDevicePrimaryCtxReset;
+thipCtxCreate *hipCtxCreate;
+thipCtxDestroy *hipCtxDestroy;
+thipCtxPushCurrent *hipCtxPushCurrent;
+thipCtxPopCurrent *hipCtxPopCurrent;
+thipCtxSetCurrent *hipCtxSetCurrent;
+thipCtxGetCurrent *hipCtxGetCurrent;
+thipCtxGetDevice *hipCtxGetDevice;
+thipCtxGetFlags *hipCtxGetFlags;
+thipCtxSynchronize *hipCtxSynchronize;
+thipDeviceSynchronize *hipDeviceSynchronize;
+thipCtxGetCacheConfig *hipCtxGetCacheConfig;
+thipCtxSetCacheConfig *hipCtxSetCacheConfig;
+thipCtxGetSharedMemConfig *hipCtxGetSharedMemConfig;
+thipCtxSetSharedMemConfig *hipCtxSetSharedMemConfig;
+thipCtxGetApiVersion *hipCtxGetApiVersion;
+thipModuleLoad *hipModuleLoad;
+thipModuleLoadData *hipModuleLoadData;
+thipModuleLoadDataEx *hipModuleLoadDataEx;
+thipModuleUnload *hipModuleUnload;
+thipModuleGetFunction *hipModuleGetFunction;
+thipModuleGetGlobal *hipModuleGetGlobal;
+thipModuleGetTexRef *hipModuleGetTexRef;
+thipMemGetInfo *hipMemGetInfo;
+thipMalloc *hipMalloc;
+thipMemAllocPitch *hipMemAllocPitch;
+thipFree *hipFree;
+thipMemGetAddressRange *hipMemGetAddressRange;
+thipHostMalloc *hipHostMalloc;
+thipHostFree *hipHostFree;
+thipHostGetDevicePointer *hipHostGetDevicePointer;
+thipHostGetFlags *hipHostGetFlags;
+thipMallocManaged *hipMallocManaged;
+thipDeviceGetByPCIBusId *hipDeviceGetByPCIBusId;
+thipDeviceGetPCIBusId *hipDeviceGetPCIBusId;
+thipMemcpyPeer *hipMemcpyPeer;
+thipMemcpyHtoD *hipMemcpyHtoD;
+thipMemcpyDtoH *hipMemcpyDtoH;
+thipMemcpyDtoD *hipMemcpyDtoD;
+thipDrvMemcpy2DUnaligned *hipDrvMemcpy2DUnaligned;
+thipMemcpyParam2D *hipMemcpyParam2D;
+thipDrvMemcpy3D *hipDrvMemcpy3D;
+thipMemcpyHtoDAsync *hipMemcpyHtoDAsync;
+thipMemcpyDtoHAsync *hipMemcpyDtoHAsync;
+thipMemcpyParam2DAsync *hipMemcpyParam2DAsync;
+thipDrvMemcpy3DAsync *hipDrvMemcpy3DAsync;
+thipMemsetD8 *hipMemsetD8;
+thipMemsetD16 *hipMemsetD16;
+thipMemsetD32 *hipMemsetD32;
+thipMemsetD8Async *hipMemsetD8Async;
+thipMemsetD16Async *hipMemsetD16Async;
+thipMemsetD32Async *hipMemsetD32Async;
+thipArrayCreate *hipArrayCreate;
+thipArrayDestroy *hipArrayDestroy;
+thipArray3DCreate *hipArray3DCreate;
+thipStreamCreateWithFlags *hipStreamCreateWithFlags;
+thipStreamCreateWithPriority *hipStreamCreateWithPriority;
+thipStreamGetPriority *hipStreamGetPriority;
+thipStreamGetFlags *hipStreamGetFlags;
+thipStreamWaitEvent *hipStreamWaitEvent;
+thipStreamAddCallback *hipStreamAddCallback;
+thipStreamQuery *hipStreamQuery;
+thipStreamSynchronize *hipStreamSynchronize;
+thipStreamDestroy *hipStreamDestroy;
+thipEventCreateWithFlags *hipEventCreateWithFlags;
+thipEventRecord *hipEventRecord;
+thipEventQuery *hipEventQuery;
+thipEventSynchronize *hipEventSynchronize;
+thipEventDestroy *hipEventDestroy;
+thipEventElapsedTime *hipEventElapsedTime;
+thipFuncGetAttribute *hipFuncGetAttribute;
+thipFuncSetCacheConfig *hipFuncSetCacheConfig;
+thipModuleLaunchKernel *hipModuleLaunchKernel;
+thipDrvOccupancyMaxActiveBlocksPerMultiprocessor *hipDrvOccupancyMaxActiveBlocksPerMultiprocessor;
+thipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags *hipDrvOccupancyMaxActiveBlocksPerMultiprocessorWithFlags;
+thipModuleOccupancyMaxPotentialBlockSize *hipModuleOccupancyMaxPotentialBlockSize;
+thipTexRefSetArray *hipTexRefSetArray;
+thipTexRefSetAddress *hipTexRefSetAddress;
+thipTexRefSetAddress2D *hipTexRefSetAddress2D;
+thipTexRefSetFormat *hipTexRefSetFormat;
+thipTexRefSetAddressMode *hipTexRefSetAddressMode;
+thipTexRefSetFilterMode *hipTexRefSetFilterMode;
+thipTexRefSetFlags *hipTexRefSetFlags;
+thipTexRefGetAddress *hipTexRefGetAddress;
+thipTexRefGetArray *hipTexRefGetArray;
+thipTexRefGetAddressMode *hipTexRefGetAddressMode;
+thipTexObjectCreate *hipTexObjectCreate;
+thipTexObjectDestroy *hipTexObjectDestroy;
+thipDeviceCanAccessPeer *hipDeviceCanAccessPeer;
+
+thipCtxEnablePeerAccess *hipCtxEnablePeerAccess;
+thipCtxDisablePeerAccess *hipCtxDisablePeerAccess;
+thipDeviceGetP2PAttribute *hipDeviceGetP2PAttribute;
+thipGraphicsUnregisterResource *hipGraphicsUnregisterResource;
+thipGraphicsMapResources *hipGraphicsMapResources;
+thipGraphicsUnmapResources *hipGraphicsUnmapResources;
+thipGraphicsResourceGetMappedPointer *hipGraphicsResourceGetMappedPointer;
+
+thipGraphicsGLRegisterBuffer *hipGraphicsGLRegisterBuffer;
+thipGLGetDevices *hipGLGetDevices;
+
+
+
+static DynamicLibrary dynamic_library_open_find(const char **paths) {
+  int i = 0;
+  while (paths[i] != NULL) {
+      DynamicLibrary lib = dynamic_library_open(paths[i]);
+      if (lib != NULL) {
+        return lib;
+      }
+      ++i;
+  }
+  return NULL;
+}
+
+/* Implementation function. */
+static void hipewHipExit(void) {
+  if (hip_lib != NULL) {
+    /*  Ignore errors. */
+    dynamic_library_close(hip_lib);
+    hip_lib = NULL;
+  }
+}
+
+static int hipewHipInit(void) {
+  /* Library paths. */
+#ifdef _WIN32
+  /* Expected in c:/windows/system or similar, no path needed. */
+  const char *hip_paths[] = {"amdhip64.dll", NULL};
+#elif defined(__APPLE__)
+  /* Default installation path. */
+  const char *hip_paths[] = {"", NULL};
+#else
+  const char *hip_paths[] = {"/opt/rocm/hip/lib/libamdhip64.so", NULL};
+#endif
+  static int initialized = 0;
+  static int result = 0;
+  int error, driver_version;
+
+  if (initialized) {
+    return result;
+  }
+
+  initialized = 1;
+
+  error = atexit(hipewHipExit);
+  if (error) {
+    result = HIPEW_ERROR_ATEXIT_FAILED;
+    return result;
+  }
+
+  /* Load library. */
+  hip_lib = dynamic_library_open_find(hip_paths);
+
+  if (hip_lib == NULL) {
+    result = HIPEW_ERROR_OPEN_FAILED;
+    return result;
+  }
+
+  /* Fetch all function pointers. */
+  HIP_LIBRARY_FIND_CHECKED(hipGetErrorName);
+  HIP_LIBRARY_FIND_CHECKED(hipInit);
+  HIP_LIBRARY_FIND_CHECKED(hipDriverGetVersion);
+  HIP_LIBRARY_FIND_CHECKED(hipGetDevice);
+  HIP_LIBRARY_FIND_CHECKED(hipGetDeviceCount);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetName);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetAttribute);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceComputeCapability);
+  HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRetain);
+  HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxRelease);
+  HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxSetFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxGetState);
+  HIP_LIBRARY_FIND_CHECKED(hipDevicePrimaryCtxReset);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxCreate);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxDestroy);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxPushCurrent);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxPopCurrent);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxSetCurrent);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxGetCurrent);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxGetDevice);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxGetFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxSynchronize);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceSynchronize);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxGetCacheConfig);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxSetCacheConfig);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxGetSharedMemConfig);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxSetSharedMemConfig);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxGetApiVersion);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleLoad);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleLoadData);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleLoadDataEx);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleUnload);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleGetFunction);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleGetGlobal);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleGetTexRef);
+  HIP_LIBRARY_FIND_CHECKED(hipMemGetInfo);
+  HIP_LIBRARY_FIND_CHECKED(hipMalloc);
+  HIP_LIBRARY_FIND_CHECKED(hipMemAllocPitch);
+  HIP_LIBRARY_FIND_CHECKED(hipFree);
+  HIP_LIBRARY_FIND_CHECKED(hipMemGetAddressRange);
+  HIP_LIBRARY_FIND_CHECKED(hipHostMalloc);
+  HIP_LIBRARY_FIND_CHECKED(hipHostFree);
+  HIP_LIBRARY_FIND_CHECKED(hipHostGetDevicePointer);
+  HIP_LIBRARY_FIND_CHECKED(hipHostGetFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipMallocManaged);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetByPCIBusId);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetPCIBusId);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyPeer);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoD);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoH);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoD);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2D);
+  HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3D);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyHtoDAsync);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyDtoHAsync);
+  HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy2DUnaligned);
+  HIP_LIBRARY_FIND_CHECKED(hipMemcpyParam2DAsync);
+  HIP_LIBRARY_FIND_CHECKED(hipDrvMemcpy3DAsync);
+  HIP_LIBRARY_FIND_CHECKED(hipMemsetD8);
+  HIP_LIBRARY_FIND_CHECKED(hipMemsetD16);
+  HIP_LIBRARY_FIND_CHECKED(hipMemsetD32);
+  HIP_LIBRARY_FIND_CHECKED(hipMemsetD8Async);
+  HIP_LIBRARY_FIND_CHECKED(hipMemsetD16Async);
+  HIP_LIBRARY_FIND_CHECKED(hipMemsetD32Async);
+  HIP_LIBRARY_FIND_CHECKED(hipArrayCreate);
+  HIP_LIBRARY_FIND_CHECKED(hipArrayDestroy);
+  HIP_LIBRARY_FIND_CHECKED(hipArray3DCreate);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamCreateWithPriority);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamGetPriority);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamGetFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamWaitEvent);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamAddCallback);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamQuery);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamSynchronize);
+  HIP_LIBRARY_FIND_CHECKED(hipStreamDestroy);
+  HIP_LIBRARY_FIND_CHECKED(hipEventCreateWithFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipEventRecord);
+  HIP_LIBRARY_FIND_CHECKED(hipEventQuery);
+  HIP_LIBRARY_FIND_CHECKED(hipEventSynchronize);
+  HIP_LIBRARY_FIND_CHECKED(hipEventDestroy);
+  HIP_LIBRARY_FIND_CHECKED(hipEventElapsedTime);
+  HIP_LIBRARY_FIND_CHECKED(hipFuncGetAttribute);
+  HIP_LIBRARY_FIND_CHECKED(hipFuncSetCacheConfig);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleLaunchKernel);
+  HIP_LIBRARY_FIND_CHECKED(hipModuleOccupancyMaxPotentialBlockSize);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetArray);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddress2D);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFormat);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetAddressMode);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFilterMode);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefSetFlags);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddress);
+  HIP_LIBRARY_FIND_CHECKED(hipTexRefGetAddressMode);
+  HIP_LIBRARY_FIND_CHECKED(hipTexObjectCreate);
+  HIP_LIBRARY_FIND_CHECKED(hipTexObjectDestroy);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceCanAccessPeer);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxEnablePeerAccess);
+  HIP_LIBRARY_FIND_CHECKED(hipCtxDisablePeerAccess);
+  HIP_LIBRARY_FIND_CHECKED(hipDeviceGetP2PAttribute);
+#ifdef _WIN32
+  HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnregisterResource);
+  HIP_LIBRARY_FIND_CHECKED(hipGraphicsMapResources);
+  HIP_LIBRARY_FIND_CHECKED(hipGraphicsUnmapResources);
+  HIP_LIBRARY_FIND_CHECKED(hipGraphicsResourceGetMappedPointer);
+  HIP_LIBRARY_FIND_CHECKED(hipGraphicsGLRegisterBuffer);
+  HIP_LIBRARY_FIND_CHECKED(hipGLGetDevices);
+#endif
+  result = HIPEW_SUCCESS;
+  return result;
+}
+
+
+
+int hipewInit(hipuint32_t flags) {
+  int result = HIPEW_SUCCESS;
+
+  if (flags & HIPEW_INIT_HIP) {
+    result = hipewHipInit();
+    if (result != HIPEW_SUCCESS) {
+      return result;
+    }
+  }
+
+  return result;
+}
+
+
+const char *hipewErrorString(hipError_t result) {
+  switch (result) {
+    case hipSuccess: return "No errors";
+    case hipErrorInvalidValue: return "Invalid value";
+    case hipErrorOutOfMemory: return "Out of memory";
+    case hipErrorNotInitialized: return "Driver not initialized";
+    case hipErrorDeinitialized: return "Driver deinitialized";
+    case hipErrorProfilerDisabled: return "Profiler disabled";
+    case hipErrorProfilerNotInitialized: return "Profiler not initialized";
+    case hipErrorProfilerAlreadyStarted: return "Profiler already started";
+    case hipErrorProfilerAlreadyStopped: return "Profiler already stopped";
+    case hipErrorNoDevice: return "No HIP-capable device available";
+    case hipErrorInvalidDevice: return "Invalid device";
+    case hipErrorInvalidImage: return "Invalid kernel image";
+    case hipErrorInvalidContext: return "Invalid context";
+    case hipErrorContextAlreadyCurrent: return "Context already current";
+    case hipErrorMapFailed: return "Map failed";
+    case hipErrorUnmapFailed: return "Unmap failed";
+    case hipErrorArrayIsMapped: return "Array is mapped";
+    case hipErrorAlreadyMapped: return "Already mapped";
+    case hipErrorNoBinaryForGpu: return "No binary for GPU";
+    case hipErrorAlreadyAcquired: return "Already acquired";
+    case hipErrorNotMapped: return "Not mapped";
+    case hipErrorNotMappedAsArray: return "Mapped resource not available for access as an array";
+    case hipErrorNotMappedAsPointer: return "Mapped resource not available for access as a pointer";
+    case hipErrorECCNotCorrectable: return "Uncorrectable ECC error detected";
+    case hipErrorUnsupportedLimit: return "hipLimit_t not supported by device";
+    case hipErrorContextAlreadyInUse: return "Context already in use";
+    case hipErrorPeerAccessUnsupported: return "Peer access unsupported";
+    case hipErrorInvalidKernelFile: return "Invalid ptx";
+    case hipErrorInvalidGraphicsContext: return "Invalid graphics context";
+    case hipErrorInvalidSource: return "Invalid source";
+    case hipErrorFileNotFound: return "File not found";
+    case hipErrorSharedObjectSymbolNotFound: return "Link to a shared object failed to resolve";
+    case hipErrorSharedObjectInitFailed: return "Shared object initialization failed";
+    case hipErrorOperatingSystem: return "Operating system";
+    case hipErrorInvalidHandle: return "Invalid handle";
+    case hipErrorNotFound: return "Not found";
+    case hipErrorNotReady: return "HIP not ready";
+    case hipErrorIllegalAddress: return "Illegal address";
+    case hipErrorLaunchOutOfResources: return "Launch exceeded resources";
+    case hipErrorLaunchTimeOut: return "Launch exceeded timeout";
+    case hipErrorPeerAccessAlreadyEnabled: return "Peer access already enabled";
+    case hipErrorPeerAccessNotEnabled: return "Peer access not enabled";
+    case hipErrorSetOnActiveProcess: return "Primary context active";
+    case hipErrorAssert: return "Assert";
+    case hipErrorHostMemoryAlreadyRegistered: return "Host memory already registered";
+    case hipErrorHostMemoryNotRegistered: return "Host memory not registered";
+    case hipErrorLaunchFailure: return "Launch failed";
+    case hipErrorCooperativeLaunchTooLarge: return "Cooperative launch too large";
+    case hipErrorNotSupported: return "Not supported";
+    case hipErrorUnknown: return "Unknown error";
+    default: return "Unknown HIP error value";
+  }
+}
+
+static void path_join(const char *path1,
+                      const char *path2,
+                      int maxlen,
+                      char *result) {
+#if defined(WIN32) || defined(_WIN32)
+  const char separator = '\\';
+#else
+  const char separator = '/';
+#endif
+  int n = snprintf(result, maxlen, "%s%c%s", path1, separator, path2);
+  if (n != -1 && n < maxlen) {
+    result[n] = '\0';
+  }
+  else {
+    result[maxlen - 1] = '\0';
+  }
+}
+
+static int path_exists(const char *path) {
+  struct stat st;
+  if (stat(path, &st)) {
+    return 0;
+  }
+  return 1;
+}
+
+const char *hipewCompilerPath(void) {
+    #ifdef _WIN32
+    const char *hipPath = getenv("HIP_ROCCLR_HOME");
+    const char *windowsCommand = "perl ";
+    const char *executable = "bin/hipcc";
+
+    static char hipcc[65536];
+    static char finalCommand[65536];
+    if(hipPath) {
+      path_join(hipPath, executable, sizeof(hipcc), hipcc);
+      if(path_exists(hipcc)) {
+        snprintf(finalCommand, sizeof(hipcc), "%s %s", windowsCommand, hipcc);
+        return finalCommand;
+      } else {
+        printf("Could not find hipcc. Make sure HIP_ROCCLR_HOME points to the directory holding /bin/hipcc");
+      }
+    }
+    #else
+    const char *hipPath =  "opt/rocm/hip/bin";
+    const char *executable = "hipcc";
+
+    static char hipcc[65536];
+    if(hipPath) {
+      path_join(hipPath, executable, sizeof(hipcc), hipcc);
+      if(path_exists(hipcc)){
+        return hipcc;
+      }
+    }
+    #endif
+
+  {
+#ifdef _WIN32
+    FILE *handle = popen("where hipcc", "r");
+#else
+    FILE *handle = popen("which hipcc", "r");
+#endif
+    if (handle) {
+      char buffer[4096] = {0};
+      int len = fread(buffer, 1, sizeof(buffer) - 1, handle);
+      buffer[len] = '\0';
+      pclose(handle);
+      if (buffer[0]) {
+        return "hipcc";
+      }
+    }
+  }
+
+  return NULL;
+}
+
+int hipewCompilerVersion(void) {
+  const char *path = hipewCompilerPath();
+  const char *marker = "Hip compilation tools, release ";
+  FILE *pipe;
+  int major, minor;
+  char *versionstr;
+  char buf[128];
+  char output[65536] = "\0";
+  char command[65536] = "\0";
+
+  if (path == NULL) {
+    return 0;
+  }
+
+  /* get --version output */
+  strcat(command, "\"");
+  strncat(command, path, sizeof(command) - 1);
+  strncat(command, "\" --version", sizeof(command) - strlen(path) - 1);
+  pipe = popen(command, "r");
+  if (!pipe) {
+    fprintf(stderr, "HIP: failed to run compiler to retrieve version");
+    return 0;
+  }
+
+  while (!feof(pipe)) {
+    if (fgets(buf, sizeof(buf), pipe) != NULL) {
+      strncat(output, buf, sizeof(output) - strlen(output) - 1);
+    }
+  }
+
+  pclose(pipe);
+  return 40;
+}
--- a/extern/tinygltf/README.blender
+++ b/extern/tinygltf/README.blender
@@ -2,4 +2,5 @@ Project: TinyGLTF
 URL: https://github.com/syoyo/tinygltf
 License: MIT
 Upstream version: 2.5.0, 19a41d20ec0
-Local modifications: None
+Local modifications: 
+* Silence "enum value not handled in switch" warnings due to JSON dependency.
--- a/extern/tinygltf/tiny_gltf.h
+++ b/extern/tinygltf/tiny_gltf.h
@@ -3201,6 +3201,7 @@ static bool ParseJsonAsValue(Value *ret, const json &o) {
      val = Value(o.get<double>());
      break;
    case json::value_t::null:
+    case json::value_t::binary:
    case json::value_t::discarded:
      // default:
      break;
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -297,6 +297,7 @@ endif()

 if(WITH_CYCLES_STANDALONE)
  set(WITH_CYCLES_DEVICE_CUDA TRUE)
+  set(WITH_CYCLES_DEVICE_HIP TRUE)
 endif()
 # TODO(sergey): Consider removing it, only causes confusion in interface.
 set(WITH_CYCLES_DEVICE_MULTI TRUE)
--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@ -53,7 +53,7 @@ struct Options {
  SessionParams session_params;
  bool quiet;
  bool show_help, interactive, pause;
-  string output_path;
+  string output_filepath;
 } options;

 static void session_print(const string &str)
@@ -160,7 +160,7 @@ static void session_init()
  /* load scene */
  scene_init();

-  options.session->reset(session_buffer_params(), options.session_params.samples);
+  options.session->reset(options.session_params, session_buffer_params());
  options.session->start();
 }

@@ -222,9 +222,7 @@ static void display_info(Progress &progress)

 static void display()
 {
-  static DeviceDrawParams draw_params = DeviceDrawParams();
-
-  options.session->draw(session_buffer_params(), draw_params);
+  options.session->draw();

  display_info(options.session->progress);
 }
@@ -254,7 +252,7 @@ static void motion(int x, int y, int button)
    options.session->scene->camera->need_flags_update = true;
    options.session->scene->camera->need_device_update = true;

-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
  }
 }

@@ -271,7 +269,7 @@ static void resize(int width, int height)
    options.session->scene->camera->need_flags_update = true;
    options.session->scene->camera->need_device_update = true;

-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
  }
 }

@@ -283,7 +281,7 @@ static void keyboard(unsigned char key)

  /* Reset */
  else if (key == 'r')
-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());

  /* Cancel */
  else if (key == 27)  // escape
@@ -320,7 +318,7 @@ static void keyboard(unsigned char key)
    options.session->scene->camera->need_flags_update = true;
    options.session->scene->camera->need_device_update = true;

-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
  }

  /* Set Max Bounces */
@@ -346,7 +344,7 @@ static void keyboard(unsigned char key)

    options.session->scene->integrator->set_max_bounce(bounce);

-    options.session->reset(session_buffer_params(), options.session_params.samples);
+    options.session->reset(options.session_params, session_buffer_params());
  }
 }
 #endif
@@ -361,11 +359,13 @@ static int files_parse(int argc, const char *argv[])

 static void options_parse(int argc, const char **argv)
 {
-  options.width = 0;
-  options.height = 0;
+  options.width = 1024;
+  options.height = 512;
  options.filepath = "";
  options.session = NULL;
  options.quiet = false;
+  options.session_params.use_auto_tile = false;
+  options.session_params.tile_size = 0;

  /* device names */
  string device_names = "";
@@ -411,7 +411,7 @@ static void options_parse(int argc, const char **argv)
             &options.session_params.samples,
             "Number of samples to render",
             "--output %s",
-             &options.output_path,
+             &options.output_filepath,
             "File path to write output image",
             "--threads %d",
             &options.session_params.threads,
@@ -422,12 +422,9 @@ static void options_parse(int argc, const char **argv)
             "--height %d",
             &options.height,
             "Window height in pixel",
-             "--tile-width %d",
-             &options.session_params.tile_size.x,
-             "Tile width in pixels",
-             "--tile-height %d",
-             &options.session_params.tile_size.y,
-             "Tile height in pixels",
+             "--tile-size %d",
+             &options.session_params.tile_size,
+             "Tile size in pixels",
             "--list-devices",
             &list,
             "List information about all available devices",
@@ -489,8 +486,9 @@ static void options_parse(int argc, const char **argv)
  options.session_params.background = true;
 #endif

-  /* Use progressive rendering */
-  options.session_params.progressive = true;
+  if (options.session_params.tile_size > 0) {
+    options.session_params.use_auto_tile = true;
+  }

  /* find matching device */
  DeviceType device_type = Device::type_from_string(devicename.c_str());
--- a/intern/cycles/blender/CMakeLists.txt
+++ b/intern/cycles/blender/CMakeLists.txt
@@ -95,6 +95,9 @@ set(ADDON_FILES

 add_definitions(${GL_DEFINITIONS})

+if(WITH_CYCLES_DEVICE_HIP)
+  add_definitions(-DWITH_HIP)
+endif()
 if(WITH_MOD_FLUID)
  add_definitions(-DWITH_FLUID)
 endif()
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -28,7 +28,7 @@ def _configure_argument_parser():
                        action='store_true')
    parser.add_argument("--cycles-device",
                        help="Set the device to use for Cycles, overriding user preferences and the scene setting."
-                             "Valid options are 'CPU', 'CUDA' or 'OPTIX'."
+                             "Valid options are 'CPU', 'CUDA', 'OPTIX', or 'HIP'"
                             "Additionally, you can append '+CPU' to any GPU type for hybrid rendering.",
                        default=None)
    return parser
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -111,6 +111,7 @@ enum_device_type = (
    ('CPU', "CPU", "CPU", 0),
    ('CUDA', "CUDA", "CUDA", 1),
    ('OPTIX', "OptiX", "OptiX", 3),
+    ("HIP", "HIP", "HIP", 4)
 )

 enum_texture_limit = (
@@ -739,14 +740,14 @@ class CyclesRenderSettings(bpy.types.PropertyGroup):

    use_auto_tile: BoolProperty(
        name="Auto Tiles",
-        description="Automatically split image into tiles",
+        description="Automatically render high resolution images in tiles to reduce memory usage, using the specified tile size. Tiles are cached to disk while rendering to save memory",
        default=True,
    )
    tile_size: IntProperty(
        name="Tile Size",
        default=2048,
        description="",
-        min=0, max=16384,
+        min=8, max=16384,
    )

    # Various fine-tuning debug flags
@@ -1266,12 +1267,16 @@ class CyclesPreferences(bpy.types.AddonPreferences):

    def get_device_types(self, context):
        import _cycles
-        has_cuda, has_optix = _cycles.get_device_types()
+        has_cuda, has_optix, has_hip = _cycles.get_device_types()
+
        list = [('NONE', "None", "Don't use compute device", 0)]
        if has_cuda:
            list.append(('CUDA', "CUDA", "Use CUDA for GPU acceleration", 1))
        if has_optix:
            list.append(('OPTIX', "OptiX", "Use OptiX for GPU acceleration", 3))
+        if has_hip:
+            list.append(('HIP', "HIP", "Use HIP for GPU acceleration", 4))
+
        return list

    compute_device_type: EnumProperty(
@@ -1296,7 +1301,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):

    def update_device_entries(self, device_list):
        for device in device_list:
-            if not device[1] in {'CUDA', 'OPTIX', 'CPU'}:
+            if not device[1] in {'CUDA', 'OPTIX', 'CPU', 'HIP'}:
                continue
            # Try to find existing Device entry
            entry = self.find_existing_device_entry(device)
@@ -1330,7 +1335,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
            elif entry.type == 'CPU':
                cpu_devices.append(entry)
        # Extend all GPU devices with CPU.
-        if compute_device_type != 'CPU':
+        if compute_device_type != 'CPU' and compute_device_type != 'HIP':
            devices.extend(cpu_devices)
        return devices

@@ -1340,7 +1345,7 @@ class CyclesPreferences(bpy.types.AddonPreferences):
        import _cycles
        # Ensure `self.devices` is not re-allocated when the second call to
        # get_devices_for_type is made, freeing items from the first list.
-        for device_type in ('CUDA', 'OPTIX', 'OPENCL'):
+        for device_type in ('CUDA', 'OPTIX', 'HIP'):
            self.update_device_entries(_cycles.available_devices(device_type))

    # Deprecated: use refresh_devices instead.
--- a/intern/cycles/blender/addon/ui.py
+++ b/intern/cycles/blender/addon/ui.py
@@ -99,6 +99,11 @@ def use_cuda(context):
    return (get_device_type(context) == 'CUDA' and cscene.device == 'GPU')


+def use_hip(context):
+    cscene = context.scene.cycles
+
+    return (get_device_type(context) == 'HIP' and cscene.device == 'GPU')
+
 def use_optix(context):
    cscene = context.scene.cycles

@@ -613,8 +618,8 @@ class CYCLES_RENDER_PT_performance_threads(CyclesButtonsPanel, Panel):
        sub.prop(rd, "threads")


-class CYCLES_RENDER_PT_performance_tiles(CyclesButtonsPanel, Panel):
-    bl_label = "Tiles"
+class CYCLES_RENDER_PT_performance_memory(CyclesButtonsPanel, Panel):
+    bl_label = "Memory"
    bl_parent_id = "CYCLES_RENDER_PT_performance"

    def draw(self, context):
@@ -2107,7 +2112,7 @@ classes = (
    CYCLES_RENDER_PT_film_transparency,
    CYCLES_RENDER_PT_performance,
    CYCLES_RENDER_PT_performance_threads,
-    CYCLES_RENDER_PT_performance_tiles,
+    CYCLES_RENDER_PT_performance_memory,
    CYCLES_RENDER_PT_performance_acceleration_structure,
    CYCLES_RENDER_PT_performance_final_render,
    CYCLES_RENDER_PT_performance_viewport,
--- a/intern/cycles/blender/blender_curves.cpp
+++ b/intern/cycles/blender/blender_curves.cpp
@@ -283,10 +283,13 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
    return;

  Attribute *attr_intercept = NULL;
+  Attribute *attr_length = NULL;
  Attribute *attr_random = NULL;

  if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT))
    attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH))
+    attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH);
  if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM))
    attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);

@@ -336,6 +339,10 @@ static void ExportCurveSegments(Scene *scene, Hair *hair, ParticleCurveData *CDa
        num_curve_keys++;
      }

+      if (attr_length != NULL) {
+        attr_length->add(CData->curve_length[curve]);
+      }
+
      if (attr_random != NULL) {
        attr_random->add(hash_uint2_to_float(num_curves, 0));
      }
@@ -657,11 +664,15 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)

  /* Add requested attributes. */
  Attribute *attr_intercept = NULL;
+  Attribute *attr_length = NULL;
  Attribute *attr_random = NULL;

  if (hair->need_attribute(scene, ATTR_STD_CURVE_INTERCEPT)) {
    attr_intercept = hair->attributes.add(ATTR_STD_CURVE_INTERCEPT);
  }
+  if (hair->need_attribute(scene, ATTR_STD_CURVE_LENGTH)) {
+    attr_length = hair->attributes.add(ATTR_STD_CURVE_LENGTH);
+  }
  if (hair->need_attribute(scene, ATTR_STD_CURVE_RANDOM)) {
    attr_random = hair->attributes.add(ATTR_STD_CURVE_RANDOM);
  }
@@ -714,6 +725,10 @@ static void export_hair_curves(Scene *scene, Hair *hair, BL::Hair b_hair)
      }
    }

+    if (attr_length) {
+      attr_length->add(length);
+    }
+
    /* Random number per curve. */
    if (attr_random != NULL) {
      attr_random->add(hash_uint2_to_float(b_curve.index(), 0));
--- a/intern/cycles/blender/blender_device.cpp
+++ b/intern/cycles/blender/blender_device.cpp
@@ -26,6 +26,7 @@ enum ComputeDevice {
  COMPUTE_DEVICE_CPU = 0,
  COMPUTE_DEVICE_CUDA = 1,
  COMPUTE_DEVICE_OPTIX = 3,
+  COMPUTE_DEVICE_HIP = 4,

  COMPUTE_DEVICE_NUM
 };
@@ -81,6 +82,9 @@ DeviceInfo blender_device_info(BL::Preferences &b_preferences, BL::Scene &b_scen
      else if (compute_device == COMPUTE_DEVICE_OPTIX) {
        mask |= DEVICE_MASK_OPTIX;
      }
+      else if (compute_device == COMPUTE_DEVICE_HIP) {
+        mask |= DEVICE_MASK_HIP;
+      }
      vector<DeviceInfo> devices = Device::available_devices(mask);

      /* Match device preferences and available devices. */
--- a/intern/cycles/blender/blender_geometry.cpp
+++ b/intern/cycles/blender/blender_geometry.cpp
@@ -80,8 +80,10 @@ Geometry *BlenderSync::sync_geometry(BL::Depsgraph &b_depsgraph,
 {
  /* Test if we can instance or if the object is modified. */
  Geometry::Type geom_type = determine_geom_type(b_ob_info, use_particle_hair);
-  BL::ID b_key_id = (BKE_object_is_modified(b_ob_info.real_object)) ? b_ob_info.real_object :
-                                                                      b_ob_info.object_data;
+  BL::ID b_key_id = (b_ob_info.is_real_object_data() &&
+                     BKE_object_is_modified(b_ob_info.real_object)) ?
+                        b_ob_info.real_object :
+                        b_ob_info.object_data;
  GeometryKey key(b_key_id.ptr.data, geom_type);

  /* Find shader indices. */
--- a/intern/cycles/blender/blender_gpu_display.cpp
+++ b/intern/cycles/blender/blender_gpu_display.cpp
@@ -485,12 +485,6 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
  /* See do_update_begin() for why no locking is required here. */
  const bool transparent = true;  // TODO(sergey): Derive this from Film.

-  if (texture_.need_clear) {
-    /* Texture is requested to be cleared and was not yet cleared.
-     * Do early return which should be equivalent of drawing all-zero texture. */
-    return;
-  }
-
  if (!gl_draw_resources_ensure()) {
    return;
  }
@@ -499,6 +493,16 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
    gl_context_mutex_.lock();
  }

+  if (texture_.need_clear) {
+    /* Texture is requested to be cleared and was not yet cleared.
+     *
+     * Do early return which should be equivalent of drawing all-zero texture.
+     * Watchout for the lock though so that the clear happening during update is properly
+     * synchronized here. */
+    gl_context_mutex_.unlock();
+    return;
+  }
+
  if (gl_upload_sync_) {
    glWaitSync((GLsync)gl_upload_sync_, 0, GL_TIMEOUT_IGNORED);
  }
@@ -524,7 +528,7 @@ void BlenderGPUDisplay::do_draw(const GPUDisplayParams &params)
  const float zoomed_width = params.size.x * zoom_.x;
  const float zoomed_height = params.size.y * zoom_.y;
  if (texture_.width != params.size.x || texture_.height != params.size.y) {
-    /* Resolution divider is different from 1, force enarest interpolation. */
+    /* Resolution divider is different from 1, force nearest interpolation. */
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  }
  else if (zoomed_width - params.size.x > 0.5f || zoomed_height - params.size.y > 0.5f) {
--- a/intern/cycles/blender/blender_python.cpp
+++ b/intern/cycles/blender/blender_python.cpp
@@ -911,14 +911,16 @@ static PyObject *enable_print_stats_func(PyObject * /*self*/, PyObject * /*args*
 static PyObject *get_device_types_func(PyObject * /*self*/, PyObject * /*args*/)
 {
  vector<DeviceType> device_types = Device::available_types();
-  bool has_cuda = false, has_optix = false;
+  bool has_cuda = false, has_optix = false, has_hip = false;
  foreach (DeviceType device_type, device_types) {
    has_cuda |= (device_type == DEVICE_CUDA);
    has_optix |= (device_type == DEVICE_OPTIX);
+    has_hip |= (device_type == DEVICE_HIP);
  }
-  PyObject *list = PyTuple_New(2);
+  PyObject *list = PyTuple_New(3);
  PyTuple_SET_ITEM(list, 0, PyBool_FromLong(has_cuda));
  PyTuple_SET_ITEM(list, 1, PyBool_FromLong(has_optix));
+  PyTuple_SET_ITEM(list, 2, PyBool_FromLong(has_hip));
  return list;
 }

@@ -944,6 +946,9 @@ static PyObject *set_device_override_func(PyObject * /*self*/, PyObject *arg)
  else if (override == "OPTIX") {
    BlenderSession::device_override = DEVICE_MASK_OPTIX;
  }
+  else if (override == "HIP") {
+    BlenderSession::device_override = DEVICE_MASK_HIP;
+  }
  else {
    printf("\nError: %s is not a valid Cycles device.\n", override.c_str());
    Py_RETURN_FALSE;
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -71,7 +71,8 @@ BlenderSession::BlenderSession(BL::RenderEngine &b_engine,
      width(0),
      height(0),
      preview_osl(preview_osl),
-      python_thread_state(NULL)
+      python_thread_state(NULL),
+      use_developer_ui(false)
 {
  /* offline render */
  background = true;
@@ -311,6 +312,8 @@ void BlenderSession::read_render_tile()
  for (BL::RenderPass &b_pass : b_rlay.passes) {
    session->set_render_tile_pixels(b_pass.name(), b_pass.channels(), (float *)b_pass.rect());
  }
+
+  b_engine.end_result(b_rr, false, false, false);
 }

 void BlenderSession::write_render_tile()
@@ -557,6 +560,11 @@ void BlenderSession::render(BL::Depsgraph &b_depsgraph_)
  /* free result without merging */
  b_engine.end_result(b_rr, true, false, false);

+  /* When tiled rendering is used there will be no "write" done for the tile. Forcefully clear
+   * highlighted tiles now, so that the highlight will be removed while processing full frame from
+   * file. */
+  b_engine.tile_highlight_clear_all();
+
  double total_time, render_time;
  session->progress.get_time(total_time, render_time);
  VLOG(1) << "Total render time: " << total_time;
@@ -581,6 +589,12 @@ void BlenderSession::render_frame_finish()

  for (string_view filename : full_buffer_files_) {
    session->process_full_buffer_from_disk(filename);
+    if (check_and_report_session_error()) {
+      break;
+    }
+  }
+
+  for (string_view filename : full_buffer_files_) {
    path_remove(filename);
  }

@@ -988,8 +1002,9 @@ void BlenderSession::update_status_progress()
  get_status(status, substatus);
  get_progress(progress, total_time, render_time);

-  if (progress > 0)
-    remaining_time = (1.0 - (double)progress) * (render_time / (double)progress);
+  if (progress > 0) {
+    remaining_time = session->get_estimated_remaining_time();
+  }

  if (background) {
    if (scene)
@@ -1027,20 +1042,27 @@ void BlenderSession::update_status_progress()
    last_progress = progress;
  }

-  if (session->progress.get_error()) {
-    string error = session->progress.get_error_message();
-    if (error != last_error) {
-      /* TODO(sergey): Currently C++ RNA API doesn't let us to
-       * use mnemonic name for the variable. Would be nice to
-       * have this figured out.
-       *
-       * For until then, 1 << 5 means RPT_ERROR.
-       */
-      b_engine.report(1 << 5, error.c_str());
-      b_engine.error_set(error.c_str());
-      last_error = error;
-    }
+  check_and_report_session_error();
+}
+
+bool BlenderSession::check_and_report_session_error()
+{
+  if (!session->progress.get_error()) {
+    return false;
  }
+
+  const string error = session->progress.get_error_message();
+  if (error != last_error) {
+    /* TODO(sergey): Currently C++ RNA API doesn't let us to use mnemonic name for the variable.
+     * Would be nice to have this figured out.
+     *
+     * For until then, 1 << 5 means RPT_ERROR. */
+    b_engine.report(1 << 5, error.c_str());
+    b_engine.error_set(error.c_str());
+    last_error = error;
+  }
+
+  return true;
 }

 void BlenderSession::tag_update()
--- a/intern/cycles/blender/blender_session.h
+++ b/intern/cycles/blender/blender_session.h
@@ -110,8 +110,7 @@ class BlenderSession {
  BL::RenderSettings b_render;
  BL::Depsgraph b_depsgraph;
  /* NOTE: Blender's scene might become invalid after call
-   * free_blender_memory_if_possible().
-   */
+   * #free_blender_memory_if_possible(). */
  BL::Scene b_scene;
  BL::SpaceView3D b_v3d;
  BL::RegionView3D b_rv3d;
@@ -147,6 +146,11 @@ class BlenderSession {
 protected:
  void stamp_view_layer_metadata(Scene *scene, const string &view_layer_name);

+  /* Check whether session error happened.
+   * If so, it is reported to the render engine and true is returned.
+   * Otherwise false is returned. */
+  bool check_and_report_session_error();
+
  void builtin_images_load();

  /* Is used after each render layer synchronization is done with the goal
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -855,7 +855,7 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine &b_engine,

  if (background) {
    params.use_auto_tile = RNA_boolean_get(&cscene, "use_auto_tile");
-    params.tile_size = get_int(cscene, "tile_size");
+    params.tile_size = max(get_int(cscene, "tile_size"), 8);
  }
  else {
    params.use_auto_tile = false;
--- a/intern/cycles/blender/blender_util.h
+++ b/intern/cycles/blender/blender_util.h
@@ -90,26 +90,27 @@ static inline BL::Mesh object_to_mesh(BL::BlendData & /*data*/,
  }
 #endif

-  BL::Mesh mesh(PointerRNA_NULL);
-  if (b_ob_info.object_data.is_a(&RNA_Mesh)) {
-    /* TODO: calc_undeformed is not used. */
-    mesh = BL::Mesh(b_ob_info.object_data);
+  BL::Mesh mesh = (b_ob_info.object_data.is_a(&RNA_Mesh)) ? BL::Mesh(b_ob_info.object_data) :
+                                                            BL::Mesh(PointerRNA_NULL);

-    /* Make a copy to split faces if we use autosmooth, otherwise not needed.
-     * Also in edit mode do we need to make a copy, to ensure data layers like
-     * UV are not empty. */
-    if (mesh.is_editmode() ||
-        (mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) {
+  if (b_ob_info.is_real_object_data()) {
+    if (mesh) {
+      /* Make a copy to split faces if we use autosmooth, otherwise not needed.
+       * Also in edit mode do we need to make a copy, to ensure data layers like
+       * UV are not empty. */
+      if (mesh.is_editmode() ||
+          (mesh.use_auto_smooth() && subdivision_type == Mesh::SUBDIVISION_NONE)) {
+        BL::Depsgraph depsgraph(PointerRNA_NULL);
+        mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
+      }
+    }
+    else {
      BL::Depsgraph depsgraph(PointerRNA_NULL);
-      assert(b_ob_info.is_real_object_data());
      mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
    }
  }
  else {
-    BL::Depsgraph depsgraph(PointerRNA_NULL);
-    if (b_ob_info.is_real_object_data()) {
-      mesh = b_ob_info.real_object.to_mesh(false, depsgraph);
-    }
+    /* TODO: what to do about non-mesh geometry instances? */
  }

 #if 0
--- a/intern/cycles/bvh/bvh_embree.cpp
+++ b/intern/cycles/bvh/bvh_embree.cpp
@@ -213,7 +213,7 @@ static void rtc_filter_occluded_func(const RTCFilterFunctionNArguments *args)
      if (ctx->num_hits < ctx->max_hits) {
        Intersection current_isect;
        kernel_embree_convert_hit(kg, ray, hit, &current_isect);
-        for (size_t i = 0; i < ctx->max_hits; ++i) {
+        for (size_t i = 0; i < ctx->num_hits; ++i) {
          if (current_isect.object == ctx->isect_s[i].object &&
              current_isect.prim == ctx->isect_s[i].prim && current_isect.t == ctx->isect_s[i].t) {
            /* This intersection was already recorded, skip it. */
--- a/intern/cycles/cmake/external_libs.cmake
+++ b/intern/cycles/cmake/external_libs.cmake
@@ -532,4 +532,13 @@ if(WITH_CYCLES_CUDA_BINARIES OR NOT WITH_CUDA_DYNLOAD)
  endif()
 endif()

+
+###########################################################################
+# HIP
+###########################################################################
+
+if(NOT WITH_HIP_DYNLOAD)
+  set(WITH_HIP_DYNLOAD ON)
+endif()
+
 unset(_cycles_lib_dir)
--- a/intern/cycles/cmake/macros.cmake
+++ b/intern/cycles/cmake/macros.cmake
@@ -156,10 +156,16 @@ macro(cycles_target_link_libraries target)
    ${PLATFORM_LINKLIBS}
  )

-  if(WITH_CUDA_DYNLOAD)
-    target_link_libraries(${target} extern_cuew)
-  else()
-    target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
+  if(WITH_CYCLES_DEVICE_CUDA OR WITH_CYCLES_DEVICE_OPTIX)
+    if(WITH_CUDA_DYNLOAD)
+      target_link_libraries(${target} extern_cuew)
+    else()
+      target_link_libraries(${target} ${CUDA_CUDA_LIBRARY})
+    endif()
+  endif()
+
+  if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
+    target_link_libraries(${target} extern_hipew)
  endif()

  if(CYCLES_STANDALONE_REPOSITORY)
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -22,16 +22,25 @@ set(INC_SYS
  ../../../extern/clew/include
 )

-if(WITH_CUDA_DYNLOAD)
+if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
+  if(WITH_CUDA_DYNLOAD)
+    list(APPEND INC
+      ../../../extern/cuew/include
+    )
+    add_definitions(-DWITH_CUDA_DYNLOAD)
+  else()
+    list(APPEND INC_SYS
+      ${CUDA_TOOLKIT_INCLUDE}
+    )
+    add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
+  endif()
+endif()
+
+if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
  list(APPEND INC
-    ../../../extern/cuew/include
+    ../../../extern/hipew/include
  )
-  add_definitions(-DWITH_CUDA_DYNLOAD)
-else()
-  list(APPEND INC_SYS
-    ${CUDA_TOOLKIT_INCLUDE}
-  )
-  add_definitions(-DCYCLES_CUDA_NVCC_EXECUTABLE="${CUDA_NVCC_EXECUTABLE}")
+  add_definitions(-DWITH_HIP_DYNLOAD)
 endif()

 set(SRC
@@ -70,6 +79,21 @@ set(SRC_CUDA
  cuda/util.h
 )

+set(SRC_HIP
+  hip/device.cpp
+  hip/device.h
+  hip/device_impl.cpp
+  hip/device_impl.h
+  hip/graphics_interop.cpp
+  hip/graphics_interop.h
+  hip/kernel.cpp
+  hip/kernel.h
+  hip/queue.cpp
+  hip/queue.h
+  hip/util.cpp
+  hip/util.h
+)
+
 set(SRC_DUMMY
  dummy/device.cpp
  dummy/device.h
@@ -105,13 +129,21 @@ set(LIB
  ${CYCLES_GL_LIBRARIES}
 )

-if(WITH_CUDA_DYNLOAD)
+if(WITH_CYCLES_DEVICE_OPTIX OR WITH_CYCLES_DEVICE_CUDA)
+  if(WITH_CUDA_DYNLOAD)
+    list(APPEND LIB
+      extern_cuew
+    )
+  else()
+    list(APPEND LIB
+      ${CUDA_CUDA_LIBRARY}
+    )
+  endif()
+endif()
+
+if(WITH_CYCLES_DEVICE_HIP AND WITH_HIP_DYNLOAD)
  list(APPEND LIB
-    extern_cuew
-  )
-else()
-  list(APPEND LIB
-    ${CUDA_CUDA_LIBRARY}
+    extern_hipew
  )
 endif()

@@ -120,6 +152,9 @@ add_definitions(${GL_DEFINITIONS})
 if(WITH_CYCLES_DEVICE_CUDA)
  add_definitions(-DWITH_CUDA)
 endif()
+if(WITH_CYCLES_DEVICE_HIP)
+  add_definitions(-DWITH_HIP)
+endif()
 if(WITH_CYCLES_DEVICE_OPTIX)
  add_definitions(-DWITH_OPTIX)
 endif()
@@ -140,6 +175,7 @@ cycles_add_library(cycles_device "${LIB}"
  ${SRC}
  ${SRC_CPU}
  ${SRC_CUDA}
+  ${SRC_HIP}
  ${SRC_DUMMY}
  ${SRC_MULTI}
  ${SRC_OPTIX}
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -54,7 +54,6 @@
 #include "util/util_function.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
-#include "util/util_opengl.h"
 #include "util/util_openimagedenoise.h"
 #include "util/util_optimization.h"
 #include "util/util_progress.h"
@@ -170,7 +169,7 @@ void CPUDevice::mem_copy_to(device_memory &mem)
 }

 void CPUDevice::mem_copy_from(
-    device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/)
+    device_memory & /*mem*/, size_t /*y*/, size_t /*w*/, size_t /*h*/, size_t /*elem*/)
 {
  /* no-op */
 }
@@ -204,7 +203,7 @@ void CPUDevice::mem_free(device_memory &mem)
  }
 }

-device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+device_ptr CPUDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
 {
  return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
 }
@@ -298,154 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
    Device::build_bvh(bvh, progress, refit);
 }

-#if 0
-void CPUDevice::render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
-{
-  const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
-
-  scoped_timer timer(&tile.buffers->render_time);
-
-  Coverage coverage(kg, tile);
-  if (use_coverage) {
-    coverage.init_path_trace();
-  }
-
-  float *render_buffer = (float *)tile.buffer;
-  int start_sample = tile.start_sample;
-  int end_sample = tile.start_sample + tile.num_samples;
-
-  /* Needed for Embree. */
-  SIMD_SET_FLUSH_TO_ZERO;
-
-  for (int sample = start_sample; sample < end_sample; sample++) {
-    if (task.get_cancel() || TaskPool::canceled()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-
-    if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
-      tile.stealing_state = RenderTile::WAS_STOLEN;
-      break;
-    }
-
-    if (tile.task == RenderTile::PATH_TRACE) {
-      for (int y = tile.y; y < tile.y + tile.h; y++) {
-        for (int x = tile.x; x < tile.x + tile.w; x++) {
-          if (use_coverage) {
-            coverage.init_pixel(x, y);
-          }
-          kernels.path_trace(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
-        }
-      }
-    }
-    else {
-      for (int y = tile.y; y < tile.y + tile.h; y++) {
-        for (int x = tile.x; x < tile.x + tile.w; x++) {
-          kernels.bake(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
-        }
-      }
-    }
-    tile.sample = sample + 1;
-
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
-      const bool stop = adaptive_sampling_filter(kg, tile, sample);
-      if (stop) {
-        const int num_progress_samples = end_sample - sample;
-        tile.sample = end_sample;
-        task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
-        break;
-      }
-    }
-
-    task.update_progress(&tile, tile.w * tile.h);
-  }
-  if (use_coverage) {
-    coverage.finalize();
-  }
-
-  if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
-    adaptive_sampling_post(tile, kg);
-  }
-}
-
-void CPUDevice::thread_render(DeviceTask &task)
-{
-  if (TaskPool::canceled()) {
-    if (task.need_finish_queue == false)
-      return;
-  }
-
-  /* allocate buffer for kernel globals */
-  CPUKernelThreadGlobals kg(kernel_globals, get_cpu_osl_memory());
-
-  profiler.add_state(&kg.profiler);
-
-  /* NLM denoiser. */
-  DenoisingTask *denoising = NULL;
-
-  /* OpenImageDenoise: we can only denoise with one thread at a time, so to
-   * avoid waiting with mutex locks in the denoiser, we let only a single
-   * thread acquire denoising tiles. */
-  uint tile_types = task.tile_types;
-  bool hold_denoise_lock = false;
-  if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
-    if (!oidn_task_lock.try_lock()) {
-      tile_types &= ~RenderTile::DENOISE;
-      hold_denoise_lock = true;
-    }
-  }
-
-  RenderTile tile;
-  while (task.acquire_tile(this, tile, tile_types)) {
-    if (tile.task == RenderTile::PATH_TRACE) {
-      render(task, tile, &kg);
-    }
-    else if (tile.task == RenderTile::BAKE) {
-      render(task, tile, &kg);
-    }
-    else if (tile.task == RenderTile::DENOISE) {
-      denoise_openimagedenoise(task, tile);
-      task.update_progress(&tile, tile.w * tile.h);
-    }
-
-    task.release_tile(tile);
-
-    if (TaskPool::canceled()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  if (hold_denoise_lock) {
-    oidn_task_lock.unlock();
-  }
-
-  profiler.remove_state(&kg.profiler);
-
-  delete denoising;
-}
-
-void CPUDevice::thread_denoise(DeviceTask &task)
-{
-  RenderTile tile;
-  tile.x = task.x;
-  tile.y = task.y;
-  tile.w = task.w;
-  tile.h = task.h;
-  tile.buffer = task.buffer;
-  tile.sample = task.sample + task.num_samples;
-  tile.num_samples = task.num_samples;
-  tile.start_sample = task.sample;
-  tile.offset = task.offset;
-  tile.stride = task.stride;
-  tile.buffers = task.buffers;
-
-  denoise_openimagedenoise(task, tile);
-
-  task.update_progress(&tile, tile.w * tile.h);
-}
-#endif
-
 const CPUKernels *CPUDevice::get_cpu_kernels() const
 {
  return &kernels;
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -72,10 +72,13 @@ class CPUDevice : public Device {

  virtual void mem_alloc(device_memory &mem) override;
  virtual void mem_copy_to(device_memory &mem) override;
-  virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+  virtual void mem_copy_from(
+      device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
  virtual void mem_zero(device_memory &mem) override;
  virtual void mem_free(device_memory &mem) override;
-  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem,
+                                       size_t offset,
+                                       size_t /*size*/) override;

  virtual void const_copy_to(const char *name, void *host, size_t size) override;

--- a/intern/cycles/device/cuda/device_impl.cpp
+++ b/intern/cycles/device/cuda/device_impl.cpp
@@ -31,7 +31,6 @@
 #  include "util/util_logging.h"
 #  include "util/util_map.h"
 #  include "util/util_md5.h"
-#  include "util/util_opengl.h"
 #  include "util/util_path.h"
 #  include "util/util_string.h"
 #  include "util/util_system.h"
@@ -837,7 +836,7 @@ void CUDADevice::mem_copy_to(device_memory &mem)
  }
 }

-void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
 {
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
@@ -891,7 +890,7 @@ void CUDADevice::mem_free(device_memory &mem)
  }
 }

-device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
 {
  return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
 }
@@ -1169,141 +1168,6 @@ void CUDADevice::tex_free(device_texture &mem)
  }
 }

-#  if 0
-void CUDADevice::render(DeviceTask &task,
-                        RenderTile &rtile,
-                        device_vector<KernelWorkTile> &work_tiles)
-{
-  scoped_timer timer(&rtile.buffers->render_time);
-
-  if (have_error())
-    return;
-
-  CUDAContextScope scope(this);
-  CUfunction cuRender;
-
-  /* Get kernel function. */
-  if (rtile.task == RenderTile::BAKE) {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
-  }
-  else {
-    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
-  }
-
-  if (have_error()) {
-    return;
-  }
-
-  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
-
-  /* Allocate work tile. */
-  work_tiles.alloc(1);
-
-  KernelWorkTile *wtile = work_tiles.data();
-  wtile->x = rtile.x;
-  wtile->y = rtile.y;
-  wtile->w = rtile.w;
-  wtile->h = rtile.h;
-  wtile->offset = rtile.offset;
-  wtile->stride = rtile.stride;
-  wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
-
-  /* Prepare work size. More step samples render faster, but for now we
-   * remain conservative for GPUs connected to a display to avoid driver
-   * timeouts and display freezing. */
-  int min_blocks, num_threads_per_block;
-  cuda_assert(
-      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
-  if (!info.display_device) {
-    min_blocks *= 8;
-  }
-
-  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-  /* Render all samples. */
-  uint start_sample = rtile.start_sample;
-  uint end_sample = rtile.start_sample + rtile.num_samples;
-
-  for (int sample = start_sample; sample < end_sample;) {
-    /* Setup and copy work tile to device. */
-    wtile->start_sample = sample;
-    wtile->num_samples = step_samples;
-    if (task.adaptive_sampling.use) {
-      wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
-    }
-    wtile->num_samples = min(wtile->num_samples, end_sample - sample);
-    work_tiles.copy_to_device();
-
-    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
-    uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-    uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-    /* Launch kernel. */
-    void *args[] = {&d_work_tiles, &total_work_size};
-
-    cuda_assert(
-        cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
-
-    /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
-    uint filter_sample = sample + wtile->num_samples - 1;
-    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
-      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
-    }
-
-    cuda_assert(cuCtxSynchronize());
-
-    /* Update progress. */
-    sample += wtile->num_samples;
-    rtile.sample = sample;
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-
-    if (task.get_cancel()) {
-      if (task.need_finish_queue == false)
-        break;
-    }
-  }
-
-  /* Finalize adaptive sampling. */
-  if (task.adaptive_sampling.use) {
-    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
-    adaptive_sampling_post(rtile, wtile, d_work_tiles);
-    cuda_assert(cuCtxSynchronize());
-    task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
-  }
-}
-
-void CUDADevice::thread_run(DeviceTask &task)
-{
-  CUDAContextScope scope(this);
-
-  if (task.type == DeviceTask::RENDER) {
-    device_vector<KernelWorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-    /* keep rendering tiles until done */
-    RenderTile tile;
-    DenoisingTask denoising(this, task);
-
-    while (task.acquire_tile(this, tile, task.tile_types)) {
-      if (tile.task == RenderTile::PATH_TRACE) {
-        render(task, tile, work_tiles);
-      }
-      else if (tile.task == RenderTile::BAKE) {
-        render(task, tile, work_tiles);
-      }
-
-      task.release_tile(tile);
-
-      if (task.get_cancel()) {
-        if (task.need_finish_queue == false)
-          break;
-      }
-    }
-
-    work_tiles.free();
-  }
-}
-#  endif
-
 unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
 {
  return make_unique<CUDADeviceQueue>(this);
--- a/intern/cycles/device/cuda/device_impl.h
+++ b/intern/cycles/device/cuda/device_impl.h
@@ -26,7 +26,6 @@
 #  ifdef WITH_CUDA_DYNLOAD
 #    include "cuew.h"
 #  else
-#    include "util/util_opengl.h"
 #    include <cuda.h>
 #    include <cudaGL.h>
 #  endif
@@ -120,13 +119,13 @@ class CUDADevice : public Device {

  void mem_copy_to(device_memory &mem) override;

-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override;
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;

  void mem_zero(device_memory &mem) override;

  void mem_free(device_memory &mem) override;

-  device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override;
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;

  virtual void const_copy_to(const char *name, void *host, size_t size) override;

--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -116,18 +116,18 @@ bool CUDADeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *ar
  }

  /* Launch kernel. */
-  cuda_device_assert(cuda_device_,
-                     cuLaunchKernel(cuda_kernel.function,
-                                    num_blocks,
-                                    1,
-                                    1,
-                                    num_threads_per_block,
-                                    1,
-                                    1,
-                                    shared_mem_bytes,
-                                    cuda_stream_,
-                                    args,
-                                    0));
+  assert_success(cuLaunchKernel(cuda_kernel.function,
+                                num_blocks,
+                                1,
+                                1,
+                                num_threads_per_block,
+                                1,
+                                1,
+                                shared_mem_bytes,
+                                cuda_stream_,
+                                args,
+                                0),
+                 "enqueue");

  return !(cuda_device_->have_error());
 }
@@ -139,7 +139,8 @@ bool CUDADeviceQueue::synchronize()
  }

  const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
+  assert_success(cuStreamSynchronize(cuda_stream_), "synchronize");
+
  debug_synchronize();

  return !(cuda_device_->have_error());
@@ -162,9 +163,9 @@ void CUDADeviceQueue::zero_to_device(device_memory &mem)
  assert(mem.device_pointer != 0);

  const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
-      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_));
+  assert_success(
+      cuMemsetD8Async((CUdeviceptr)mem.device_pointer, 0, mem.memory_size(), cuda_stream_),
+      "zero_to_device");
 }

 void CUDADeviceQueue::copy_to_device(device_memory &mem)
@@ -185,10 +186,10 @@ void CUDADeviceQueue::copy_to_device(device_memory &mem)

  /* Copy memory to device. */
  const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
      cuMemcpyHtoDAsync(
-          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_));
+          (CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size(), cuda_stream_),
+      "copy_to_device");
 }

 void CUDADeviceQueue::copy_from_device(device_memory &mem)
@@ -204,10 +205,19 @@ void CUDADeviceQueue::copy_from_device(device_memory &mem)

  /* Copy memory from device. */
  const CUDAContextScope scope(cuda_device_);
-  cuda_device_assert(
-      cuda_device_,
+  assert_success(
      cuMemcpyDtoHAsync(
-          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_));
+          mem.host_pointer, (CUdeviceptr)mem.device_pointer, mem.memory_size(), cuda_stream_),
+      "copy_from_device");
+}
+
+void CUDADeviceQueue::assert_success(CUresult result, const char *operation)
+{
+  if (result != CUDA_SUCCESS) {
+    const char *name = cuewErrorString(result);
+    cuda_device_->set_error(string_printf(
+        "%s in CUDA queue %s (%s)", name, operation, debug_active_kernels().c_str()));
+  }
 }

 unique_ptr<DeviceGraphicsInterop> CUDADeviceQueue::graphics_interop_create()
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -60,6 +60,8 @@ class CUDADeviceQueue : public DeviceQueue {
 protected:
  CUDADevice *cuda_device_;
  CUstream cuda_stream_;
+
+  void assert_success(CUresult result, const char *operation);
 };

 CCL_NAMESPACE_END
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -25,6 +25,7 @@
 #include "device/cpu/device.h"
 #include "device/cuda/device.h"
 #include "device/dummy/device.h"
+#include "device/hip/device.h"
 #include "device/multi/device.h"
 #include "device/optix/device.h"

@@ -32,7 +33,6 @@
 #include "util/util_half.h"
 #include "util/util_logging.h"
 #include "util/util_math.h"
-#include "util/util_opengl.h"
 #include "util/util_string.h"
 #include "util/util_system.h"
 #include "util/util_time.h"
@@ -47,6 +47,7 @@ thread_mutex Device::device_mutex;
 vector<DeviceInfo> Device::cuda_devices;
 vector<DeviceInfo> Device::optix_devices;
 vector<DeviceInfo> Device::cpu_devices;
+vector<DeviceInfo> Device::hip_devices;
 uint Device::devices_initialized_mask = 0;

 /* Device */
@@ -97,6 +98,14 @@ Device *Device::create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
        device = device_optix_create(info, stats, profiler);
      break;
 #endif
+
+#ifdef WITH_HIP
+    case DEVICE_HIP:
+      if (device_hip_init())
+        device = device_hip_create(info, stats, profiler);
+      break;
+#endif
+
    default:
      break;
  }
@@ -118,6 +127,8 @@ DeviceType Device::type_from_string(const char *name)
    return DEVICE_OPTIX;
  else if (strcmp(name, "MULTI") == 0)
    return DEVICE_MULTI;
+  else if (strcmp(name, "HIP") == 0)
+    return DEVICE_HIP;

  return DEVICE_NONE;
 }
@@ -132,6 +143,8 @@ string Device::string_from_type(DeviceType type)
    return "OPTIX";
  else if (type == DEVICE_MULTI)
    return "MULTI";
+  else if (type == DEVICE_HIP)
+    return "HIP";

  return "";
 }
@@ -146,6 +159,10 @@ vector<DeviceType> Device::available_types()
 #ifdef WITH_OPTIX
  types.push_back(DEVICE_OPTIX);
 #endif
+#ifdef WITH_HIP
+  types.push_back(DEVICE_HIP);
+#endif
+
  return types;
 }

@@ -187,6 +204,20 @@ vector<DeviceInfo> Device::available_devices(uint mask)
  }
 #endif

+#ifdef WITH_HIP
+  if (mask & DEVICE_MASK_HIP) {
+    if (!(devices_initialized_mask & DEVICE_MASK_HIP)) {
+      if (device_hip_init()) {
+        device_hip_info(hip_devices);
+      }
+      devices_initialized_mask |= DEVICE_MASK_HIP;
+    }
+    foreach (DeviceInfo &info, hip_devices) {
+      devices.push_back(info);
+    }
+  }
+#endif
+
  if (mask & DEVICE_MASK_CPU) {
    if (!(devices_initialized_mask & DEVICE_MASK_CPU)) {
      device_cpu_info(cpu_devices);
@@ -227,6 +258,15 @@ string Device::device_capabilities(uint mask)
  }
 #endif

+#ifdef WITH_HIP
+  if (mask & DEVICE_MASK_HIP) {
+    if (device_hip_init()) {
+      capabilities += "\nHIP device capabilities:\n";
+      capabilities += device_hip_capabilities();
+    }
+  }
+#endif
+
  return capabilities;
 }

@@ -315,6 +355,7 @@ void Device::free_memory()
  devices_initialized_mask = 0;
  cuda_devices.free_memory();
  optix_devices.free_memory();
+  hip_devices.free_memory();
  cpu_devices.free_memory();
 }

--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -51,6 +51,7 @@ enum DeviceType {
  DEVICE_CUDA,
  DEVICE_MULTI,
  DEVICE_OPTIX,
+  DEVICE_HIP,
  DEVICE_DUMMY,
 };

@@ -58,6 +59,7 @@ enum DeviceTypeMask {
  DEVICE_MASK_CPU = (1 << DEVICE_CPU),
  DEVICE_MASK_CUDA = (1 << DEVICE_CUDA),
  DEVICE_MASK_OPTIX = (1 << DEVICE_OPTIX),
+  DEVICE_MASK_HIP = (1 << DEVICE_HIP),
  DEVICE_MASK_ALL = ~0
 };

@@ -119,7 +121,7 @@ class Device {

  string error_msg;

-  virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, int /*offset*/, int /*size*/)
+  virtual device_ptr mem_alloc_sub_ptr(device_memory & /*mem*/, size_t /*offset*/, size_t /*size*/)
  {
    /* Only required for devices that implement denoising. */
    assert(false);
@@ -273,7 +275,7 @@ class Device {

  virtual void mem_alloc(device_memory &mem) = 0;
  virtual void mem_copy_to(device_memory &mem) = 0;
-  virtual void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) = 0;
+  virtual void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) = 0;
  virtual void mem_zero(device_memory &mem) = 0;
  virtual void mem_free(device_memory &mem) = 0;

@@ -284,6 +286,7 @@ class Device {
  static vector<DeviceInfo> cuda_devices;
  static vector<DeviceInfo> optix_devices;
  static vector<DeviceInfo> cpu_devices;
+  static vector<DeviceInfo> hip_devices;
  static uint devices_initialized_mask;
 };

--- a/intern/cycles/device/device_memory.cpp
+++ b/intern/cycles/device/device_memory.cpp
@@ -136,7 +136,7 @@ void device_memory::device_copy_to()
  }
 }

-void device_memory::device_copy_from(int y, int w, int h, int elem)
+void device_memory::device_copy_from(size_t y, size_t w, size_t h, size_t elem)
 {
  assert(type != MEM_TEXTURE && type != MEM_READ_ONLY && type != MEM_GLOBAL);
  device->mem_copy_from(*this, y, w, h, elem);
@@ -181,7 +181,7 @@ bool device_memory::is_resident(Device *sub_device) const

 /* Device Sub Ptr */

-device_sub_ptr::device_sub_ptr(device_memory &mem, int offset, int size) : device(mem.device)
+device_sub_ptr::device_sub_ptr(device_memory &mem, size_t offset, size_t size) : device(mem.device)
 {
  ptr = device->mem_alloc_sub_ptr(mem, offset, size);
 }
--- a/intern/cycles/device/device_memory.h
+++ b/intern/cycles/device/device_memory.h
@@ -81,154 +81,154 @@ static constexpr size_t datatype_size(DataType datatype)

 template<typename T> struct device_type_traits {
  static const DataType data_type = TYPE_UNKNOWN;
-  static const int num_elements_cpu = sizeof(T);
-  static const int num_elements_gpu = sizeof(T);
+  static const size_t num_elements_cpu = sizeof(T);
+  static const size_t num_elements_gpu = sizeof(T);
 };

 template<> struct device_type_traits<uchar> {
  static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(uchar) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar2> {
  static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
  static_assert(sizeof(uchar2) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar3> {
  static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 3;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 3;
+  static const size_t num_elements_gpu = 3;
  static_assert(sizeof(uchar3) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uchar4> {
  static const DataType data_type = TYPE_UCHAR;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
  static_assert(sizeof(uchar4) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint> {
  static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(uint) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint2> {
  static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
  static_assert(sizeof(uint2) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint3> {
  static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 3;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 3;
+  static const size_t num_elements_gpu = 3;
  static_assert(sizeof(uint3) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint4> {
  static const DataType data_type = TYPE_UINT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
  static_assert(sizeof(uint4) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<int> {
  static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(int) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<int2> {
  static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
  static_assert(sizeof(int2) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<int3> {
  static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 3;
  static_assert(sizeof(int3) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<int4> {
  static const DataType data_type = TYPE_INT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
  static_assert(sizeof(int4) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<float> {
  static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(float) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<float2> {
  static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 2;
-  static const int num_elements_gpu = 2;
+  static const size_t num_elements_cpu = 2;
+  static const size_t num_elements_gpu = 2;
  static_assert(sizeof(float2) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<float3> {
  static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 3;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 3;
  static_assert(sizeof(float3) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<float4> {
  static const DataType data_type = TYPE_FLOAT;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
  static_assert(sizeof(float4) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<half> {
  static const DataType data_type = TYPE_HALF;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(half) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<ushort4> {
  static const DataType data_type = TYPE_UINT16;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
  static_assert(sizeof(ushort4) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint16_t> {
  static const DataType data_type = TYPE_UINT16;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(uint16_t) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<half4> {
  static const DataType data_type = TYPE_HALF;
-  static const int num_elements_cpu = 4;
-  static const int num_elements_gpu = 4;
+  static const size_t num_elements_cpu = 4;
+  static const size_t num_elements_gpu = 4;
  static_assert(sizeof(half4) == num_elements_cpu * datatype_size(data_type));
 };

 template<> struct device_type_traits<uint64_t> {
  static const DataType data_type = TYPE_UINT64;
-  static const int num_elements_cpu = 1;
-  static const int num_elements_gpu = 1;
+  static const size_t num_elements_cpu = 1;
+  static const size_t num_elements_gpu = 1;
  static_assert(sizeof(uint64_t) == num_elements_cpu * datatype_size(data_type));
 };

@@ -277,6 +277,7 @@ class device_memory {
 protected:
  friend class CUDADevice;
  friend class OptiXDevice;
+  friend class HIPDevice;

  /* Only create through subclasses. */
  device_memory(Device *device, const char *name, MemoryType type);
@@ -296,7 +297,7 @@ class device_memory {
  void device_alloc();
  void device_free();
  void device_copy_to();
-  void device_copy_from(int y, int w, int h, int elem);
+  void device_copy_from(size_t y, size_t w, size_t h, size_t elem);
  void device_zero();

  bool device_is_cpu();
@@ -565,7 +566,7 @@ template<typename T> class device_vector : public device_memory {
    device_copy_from(0, data_width, (data_height == 0) ? 1 : data_height, sizeof(T));
  }

-  void copy_from_device(int y, int w, int h)
+  void copy_from_device(size_t y, size_t w, size_t h)
  {
    device_copy_from(y, w, h, sizeof(T));
  }
@@ -601,7 +602,7 @@ template<typename T> class device_vector : public device_memory {

 class device_sub_ptr {
 public:
-  device_sub_ptr(device_memory &mem, int offset, int size);
+  device_sub_ptr(device_memory &mem, size_t offset, size_t size);
  ~device_sub_ptr();

  device_ptr operator*() const
--- a/intern/cycles/device/device_queue.cpp
+++ b/intern/cycles/device/device_queue.cpp
@@ -57,8 +57,9 @@ void DeviceQueue::debug_init_execution()
 {
  if (VLOG_IS_ON(3)) {
    last_sync_time_ = time_dt();
-    last_kernels_enqueued_ = 0;
  }
+
+  last_kernels_enqueued_ = 0;
 }

 void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
@@ -66,8 +67,9 @@ void DeviceQueue::debug_enqueue(DeviceKernel kernel, const int work_size)
  if (VLOG_IS_ON(3)) {
    VLOG(4) << "GPU queue launch " << device_kernel_as_string(kernel) << ", work_size "
            << work_size;
-    last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
  }
+
+  last_kernels_enqueued_ |= (uint64_t(1) << (uint64_t)kernel);
 }

 void DeviceQueue::debug_synchronize()
@@ -80,8 +82,14 @@ void DeviceQueue::debug_synchronize()
    stats_kernel_time_[last_kernels_enqueued_] += elapsed_time;

    last_sync_time_ = new_time;
-    last_kernels_enqueued_ = 0;
  }
+
+  last_kernels_enqueued_ = 0;
+}
+
+string DeviceQueue::debug_active_kernels()
+{
+  return device_kernel_mask_as_string(last_kernels_enqueued_);
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -21,6 +21,7 @@
 #include "device/device_graphics_interop.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
+#include "util/util_string.h"
 #include "util/util_unique_ptr.h"

 CCL_NAMESPACE_BEGIN
@@ -101,6 +102,7 @@ class DeviceQueue {
  void debug_init_execution();
  void debug_enqueue(DeviceKernel kernel, const int work_size);
  void debug_synchronize();
+  string debug_active_kernels();

  /* Combination of kernels enqueued together sync last synchronize. */
  DeviceKernelMask last_kernels_enqueued_;
--- a/intern/cycles/device/dummy/device.cpp
+++ b/intern/cycles/device/dummy/device.cpp
@@ -48,7 +48,7 @@ class DummyDevice : public Device {
  {
  }

-  virtual void mem_copy_from(device_memory &, int, int, int, int) override
+  virtual void mem_copy_from(device_memory &, size_t, size_t, size_t, size_t) override
  {
  }

--- a/intern/cycles/device/hip/device.cpp
+++ b/intern/cycles/device/hip/device.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/hip/device.h"
+
+#include "util/util_logging.h"
+
+#ifdef WITH_HIP
+#  include "device/device.h"
+#  include "device/hip/device_impl.h"
+
+#  include "util/util_string.h"
+#  include "util/util_windows.h"
+#endif /* WITH_HIP */
+
+CCL_NAMESPACE_BEGIN
+
+bool device_hip_init()
+{
+#if !defined(WITH_HIP)
+  return false;
+#elif defined(WITH_HIP_DYNLOAD)
+  static bool initialized = false;
+  static bool result = false;
+
+  if (initialized)
+    return result;
+
+  initialized = true;
+  int hipew_result = hipewInit(HIPEW_INIT_HIP);
+  if (hipew_result == HIPEW_SUCCESS) {
+    VLOG(1) << "HIPEW initialization succeeded";
+    if (HIPDevice::have_precompiled_kernels()) {
+      VLOG(1) << "Found precompiled kernels";
+      result = true;
+    }
+    else if (hipewCompilerPath() != NULL) {
+      VLOG(1) << "Found HIPCC " << hipewCompilerPath();
+      result = true;
+    }
+    else {
+      VLOG(1) << "Neither precompiled kernels nor HIPCC was found,"
+              << " unable to use HIP";
+    }
+  }
+  else {
+    VLOG(1) << "HIPEW initialization failed: "
+            << ((hipew_result == HIPEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
+                                                              "Error opening the library");
+  }
+
+  return result;
+#else  /* WITH_HIP_DYNLOAD */
+  return true;
+#endif /* WITH_HIP_DYNLOAD */
+}
+
+Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler)
+{
+#ifdef WITH_HIP
+  return new HIPDevice(info, stats, profiler);
+#else
+  (void)info;
+  (void)stats;
+  (void)profiler;
+
+  LOG(FATAL) << "Request to create HIP device without compiled-in support. Should never happen.";
+
+  return nullptr;
+#endif
+}
+
+#ifdef WITH_HIP
+static hipError_t device_hip_safe_init()
+{
+#  ifdef _WIN32
+  __try {
+    return hipInit(0);
+  }
+  __except (EXCEPTION_EXECUTE_HANDLER) {
+    /* Ignore crashes inside the HIP driver and hope we can
+     * survive even with corrupted HIP installs. */
+    fprintf(stderr, "Cycles HIP: driver crashed, continuing without HIP.\n");
+  }
+
+  return hipErrorNoDevice;
+#  else
+  return hipInit(0);
+#  endif
+}
+#endif /* WITH_HIP */
+
+void device_hip_info(vector<DeviceInfo> &devices)
+{
+#ifdef WITH_HIP
+  hipError_t result = device_hip_safe_init();
+  if (result != hipSuccess) {
+    if (result != hipErrorNoDevice)
+      fprintf(stderr, "HIP hipInit: %s\n", hipewErrorString(result));
+    return;
+  }
+
+  int count = 0;
+  result = hipGetDeviceCount(&count);
+  if (result != hipSuccess) {
+    fprintf(stderr, "HIP hipGetDeviceCount: %s\n", hipewErrorString(result));
+    return;
+  }
+
+  vector<DeviceInfo> display_devices;
+
+  for (int num = 0; num < count; num++) {
+    char name[256];
+
+    result = hipDeviceGetName(name, 256, num);
+    if (result != hipSuccess) {
+      fprintf(stderr, "HIP :hipDeviceGetName: %s\n", hipewErrorString(result));
+      continue;
+    }
+
+    int major;
+    hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, num);
+    // TODO : (Arya) What is the last major version we are supporting?
+
+    DeviceInfo info;
+
+    info.type = DEVICE_HIP;
+    info.description = string(name);
+    info.num = num;
+
+    info.has_half_images = (major >= 3);
+    info.has_nanovdb = true;
+    info.denoisers = 0;
+
+    info.has_gpu_queue = true;
+    /* Check if the device has P2P access to any other device in the system. */
+    for (int peer_num = 0; peer_num < count && !info.has_peer_memory; peer_num++) {
+      if (num != peer_num) {
+        int can_access = 0;
+        hipDeviceCanAccessPeer(&can_access, num, peer_num);
+        info.has_peer_memory = (can_access != 0);
+      }
+    }
+
+    int pci_location[3] = {0, 0, 0};
+    hipDeviceGetAttribute(&pci_location[0], hipDeviceAttributePciDomainID, num);
+    hipDeviceGetAttribute(&pci_location[1], hipDeviceAttributePciBusId, num);
+    hipDeviceGetAttribute(&pci_location[2], hipDeviceAttributePciDeviceId, num);
+    info.id = string_printf("HIP_%s_%04x:%02x:%02x",
+                            name,
+                            (unsigned int)pci_location[0],
+                            (unsigned int)pci_location[1],
+                            (unsigned int)pci_location[2]);
+
+    /* If device has a kernel timeout and no compute preemption, we assume
+     * it is connected to a display and will freeze the display while doing
+     * computations. */
+    int timeout_attr = 0, preempt_attr = 0;
+    hipDeviceGetAttribute(&timeout_attr, hipDeviceAttributeKernelExecTimeout, num);
+
+    if (timeout_attr && !preempt_attr) {
+      VLOG(1) << "Device is recognized as display.";
+      info.description += " (Display)";
+      info.display_device = true;
+      display_devices.push_back(info);
+    }
+    else {
+      VLOG(1) << "Device has compute preemption or is not used for display.";
+      devices.push_back(info);
+    }
+    VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+  }
+
+  if (!display_devices.empty())
+    devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+#else  /* WITH_HIP */
+  (void)devices;
+#endif /* WITH_HIP */
+}
+
+string device_hip_capabilities()
+{
+#ifdef WITH_HIP
+  hipError_t result = device_hip_safe_init();
+  if (result != hipSuccess) {
+    if (result != hipErrorNoDevice) {
+      return string("Error initializing HIP: ") + hipewErrorString(result);
+    }
+    return "No HIP device found\n";
+  }
+
+  int count;
+  result = hipGetDeviceCount(&count);
+  if (result != hipSuccess) {
+    return string("Error getting devices: ") + hipewErrorString(result);
+  }
+
+  string capabilities = "";
+  for (int num = 0; num < count; num++) {
+    char name[256];
+    if (hipDeviceGetName(name, 256, num) != hipSuccess) {
+      continue;
+    }
+    capabilities += string("\t") + name + "\n";
+    int value;
+#  define GET_ATTR(attr) \
+    { \
+      if (hipDeviceGetAttribute(&value, hipDeviceAttribute##attr, num) == hipSuccess) { \
+        capabilities += string_printf("\t\thipDeviceAttribute" #attr "\t\t\t%d\n", value); \
+      } \
+    } \
+    (void)0
+    /* TODO(sergey): Strip all attributes which are not useful for us
+     * or does not depend on the driver.
+     */
+    GET_ATTR(MaxThreadsPerBlock);
+    GET_ATTR(MaxBlockDimX);
+    GET_ATTR(MaxBlockDimY);
+    GET_ATTR(MaxBlockDimZ);
+    GET_ATTR(MaxGridDimX);
+    GET_ATTR(MaxGridDimY);
+    GET_ATTR(MaxGridDimZ);
+    GET_ATTR(MaxSharedMemoryPerBlock);
+    GET_ATTR(TotalConstantMemory);
+    GET_ATTR(WarpSize);
+    GET_ATTR(MaxPitch);
+    GET_ATTR(MaxRegistersPerBlock);
+    GET_ATTR(ClockRate);
+    GET_ATTR(TextureAlignment);
+    GET_ATTR(MultiprocessorCount);
+    GET_ATTR(KernelExecTimeout);
+    GET_ATTR(Integrated);
+    GET_ATTR(CanMapHostMemory);
+    GET_ATTR(ComputeMode);
+    GET_ATTR(MaxTexture1DWidth);
+    GET_ATTR(MaxTexture2DWidth);
+    GET_ATTR(MaxTexture2DHeight);
+    GET_ATTR(MaxTexture3DWidth);
+    GET_ATTR(MaxTexture3DHeight);
+    GET_ATTR(MaxTexture3DDepth);
+    GET_ATTR(ConcurrentKernels);
+    GET_ATTR(EccEnabled);
+    GET_ATTR(MemoryClockRate);
+    GET_ATTR(MemoryBusWidth);
+    GET_ATTR(L2CacheSize);
+    GET_ATTR(MaxThreadsPerMultiProcessor);
+    GET_ATTR(ComputeCapabilityMajor);
+    GET_ATTR(ComputeCapabilityMinor);
+    GET_ATTR(MaxSharedMemoryPerMultiprocessor);
+    GET_ATTR(ManagedMemory);
+    GET_ATTR(IsMultiGpuBoard);
+#  undef GET_ATTR
+    capabilities += "\n";
+  }
+
+  return capabilities;
+
+#else  /* WITH_HIP */
+  return "";
+#endif /* WITH_HIP */
+}
+
+CCL_NAMESPACE_END
--- a/intern/cycles/device/hip/device.h
+++ b/intern/cycles/device/hip/device.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "util/util_string.h"
+#include "util/util_vector.h"
+
+CCL_NAMESPACE_BEGIN
+
+class Device;
+class DeviceInfo;
+class Profiler;
+class Stats;
+
+bool device_hip_init();
+
+Device *device_hip_create(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+void device_hip_info(vector<DeviceInfo> &devices);
+
+string device_hip_capabilities();
+
+CCL_NAMESPACE_END
--- a/intern/cycles/device/hip/device_impl.cpp
+++ b/intern/cycles/device/hip/device_impl.cpp
--- a/intern/cycles/device/hip/device_impl.h
+++ b/intern/cycles/device/hip/device_impl.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/device.h"
+#  include "device/hip/kernel.h"
+#  include "device/hip/queue.h"
+#  include "device/hip/util.h"
+
+#  include "util/util_map.h"
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  else
+#    include "util/util_opengl.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class DeviceQueue;
+
+class HIPDevice : public Device {
+
+  friend class HIPContextScope;
+
+ public:
+  hipDevice_t hipDevice;
+  hipCtx_t hipContext;
+  hipModule_t hipModule;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  bool move_texture_to_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  int can_map_host;
+  int pitch_alignment;
+  int hipDevId;
+  int hipDevArchitecture;
+  bool first_error;
+
+  struct HIPMem {
+    HIPMem() : texobject(0), array(0), use_mapped_host(false)
+    {
+    }
+
+    hipTextureObject_t texobject;
+    hArray array;
+
+    /* If true, a mapped host memory in shared_pointer is being used. */
+    bool use_mapped_host;
+  };
+  typedef map<device_memory *, HIPMem> HIPMemMap;
+  HIPMemMap hip_mem_map;
+  thread_mutex hip_mem_map_mutex;
+
+  /* Bindless Textures */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+
+  HIPDeviceKernels kernels;
+
+  static bool have_precompiled_kernels();
+
+  virtual bool show_samples() const override;
+
+  virtual BVHLayoutMask get_bvh_layout_mask() const override;
+
+  void set_error(const string &error) override;
+
+  HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler);
+
+  virtual ~HIPDevice();
+
+  bool support_device(const uint /*kernel_features*/);
+
+  bool check_peer_access(Device *peer_device) override;
+
+  bool use_adaptive_compilation();
+
+  virtual string compile_kernel_get_common_cflags(const uint kernel_features);
+
+  string compile_kernel(const uint kernel_features,
+                        const char *name,
+                        const char *base = "hip",
+                        bool force_ptx = false);
+
+  virtual bool load_kernels(const uint kernel_features) override;
+  void reserve_local_memory(const uint kernel_features);
+
+  void init_host_memory();
+
+  void load_texture_info();
+
+  void move_textures_to_host(size_t size, bool for_texture);
+
+  HIPMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0);
+
+  void generic_copy_to(device_memory &mem);
+
+  void generic_free(device_memory &mem);
+
+  void mem_alloc(device_memory &mem) override;
+
+  void mem_copy_to(device_memory &mem) override;
+
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override;
+
+  void mem_zero(device_memory &mem) override;
+
+  void mem_free(device_memory &mem) override;
+
+  device_ptr mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/) override;
+
+  virtual void const_copy_to(const char *name, void *host, size_t size) override;
+
+  void global_alloc(device_memory &mem);
+
+  void global_free(device_memory &mem);
+
+  void tex_alloc(device_texture &mem);
+
+  void tex_free(device_texture &mem);
+
+  /* Graphics resources interoperability. */
+  virtual bool should_use_graphics_interop() override;
+
+  virtual unique_ptr<DeviceQueue> gpu_queue_create() override;
+
+  int get_num_multiprocessors();
+  int get_max_num_threads_per_multiprocessor();
+
+ protected:
+  bool get_device_attribute(hipDeviceAttribute_t attribute, int *value);
+  int get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value);
+};
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/hip/graphics_interop.cpp
+++ b/intern/cycles/device/hip/graphics_interop.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/graphics_interop.h"
+
+#  include "device/hip/device_impl.h"
+#  include "device/hip/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+HIPDeviceGraphicsInterop::HIPDeviceGraphicsInterop(HIPDeviceQueue *queue)
+    : queue_(queue), device_(static_cast<HIPDevice *>(queue->device))
+{
+}
+
+HIPDeviceGraphicsInterop::~HIPDeviceGraphicsInterop()
+{
+  HIPContextScope scope(device_);
+
+  if (hip_graphics_resource_) {
+    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
+  }
+}
+
+void HIPDeviceGraphicsInterop::set_destination(const DeviceGraphicsInteropDestination &destination)
+{
+  const int64_t new_buffer_area = int64_t(destination.buffer_width) * destination.buffer_height;
+
+  if (opengl_pbo_id_ == destination.opengl_pbo_id && buffer_area_ == new_buffer_area) {
+    return;
+  }
+
+  HIPContextScope scope(device_);
+
+  if (hip_graphics_resource_) {
+    hip_device_assert(device_, hipGraphicsUnregisterResource(hip_graphics_resource_));
+  }
+
+  const hipError_t result = hipGraphicsGLRegisterBuffer(
+      &hip_graphics_resource_, destination.opengl_pbo_id, hipGraphicsRegisterFlagsNone);
+  if (result != hipSuccess) {
+    LOG(ERROR) << "Error registering OpenGL buffer: " << hipewErrorString(result);
+  }
+
+  opengl_pbo_id_ = destination.opengl_pbo_id;
+  buffer_area_ = new_buffer_area;
+}
+
+device_ptr HIPDeviceGraphicsInterop::map()
+{
+  if (!hip_graphics_resource_) {
+    return 0;
+  }
+
+  HIPContextScope scope(device_);
+
+  hipDeviceptr_t hip_buffer;
+  size_t bytes;
+
+  hip_device_assert(device_,
+                    hipGraphicsMapResources(1, &hip_graphics_resource_, queue_->stream()));
+  hip_device_assert(
+      device_, hipGraphicsResourceGetMappedPointer(&hip_buffer, &bytes, hip_graphics_resource_));
+
+  return static_cast<device_ptr>(hip_buffer);
+}
+
+void HIPDeviceGraphicsInterop::unmap()
+{
+  HIPContextScope scope(device_);
+
+  hip_device_assert(device_,
+                    hipGraphicsUnmapResources(1, &hip_graphics_resource_, queue_->stream()));
+}
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/hip/graphics_interop.h
+++ b/intern/cycles/device/hip/graphics_interop.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/device_graphics_interop.h"
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+class HIPDeviceQueue;
+
+class HIPDeviceGraphicsInterop : public DeviceGraphicsInterop {
+ public:
+  explicit HIPDeviceGraphicsInterop(HIPDeviceQueue *queue);
+
+  HIPDeviceGraphicsInterop(const HIPDeviceGraphicsInterop &other) = delete;
+  HIPDeviceGraphicsInterop(HIPDeviceGraphicsInterop &&other) noexcept = delete;
+
+  ~HIPDeviceGraphicsInterop();
+
+  HIPDeviceGraphicsInterop &operator=(const HIPDeviceGraphicsInterop &other) = delete;
+  HIPDeviceGraphicsInterop &operator=(HIPDeviceGraphicsInterop &&other) = delete;
+
+  virtual void set_destination(const DeviceGraphicsInteropDestination &destination) override;
+
+  virtual device_ptr map() override;
+  virtual void unmap() override;
+
+ protected:
+  HIPDeviceQueue *queue_ = nullptr;
+  HIPDevice *device_ = nullptr;
+
+  /* OpenGL PBO which is currently registered as the destination for the CUDA buffer. */
+  uint opengl_pbo_id_ = 0;
+  /* Buffer area in pixels of the corresponding PBO. */
+  int64_t buffer_area_ = 0;
+
+  hipGraphicsResource hip_graphics_resource_ = nullptr;
+};
+
+CCL_NAMESPACE_END
+
+#endif
--- a/intern/cycles/device/hip/kernel.cpp
+++ b/intern/cycles/device/hip/kernel.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/kernel.h"
+#  include "device/hip/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+void HIPDeviceKernels::load(HIPDevice *device)
+{
+  hipModule_t hipModule = device->hipModule;
+
+  for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
+    HIPDeviceKernel &kernel = kernels_[i];
+
+    /* No megakernel used for GPU. */
+    if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+      continue;
+    }
+
+    const std::string function_name = std::string("kernel_gpu_") +
+                                      device_kernel_as_string((DeviceKernel)i);
+    hip_device_assert(device,
+                      hipModuleGetFunction(&kernel.function, hipModule, function_name.c_str()));
+
+    if (kernel.function) {
+      hip_device_assert(device, hipFuncSetCacheConfig(kernel.function, hipFuncCachePreferL1));
+
+      hip_device_assert(
+          device,
+          hipModuleOccupancyMaxPotentialBlockSize(
+              &kernel.min_blocks, &kernel.num_threads_per_block, kernel.function, 0, 0));
+    }
+    else {
+      LOG(ERROR) << "Unable to load kernel " << function_name;
+    }
+  }
+
+  loaded = true;
+}
+
+const HIPDeviceKernel &HIPDeviceKernels::get(DeviceKernel kernel) const
+{
+  return kernels_[(int)kernel];
+}
+
+bool HIPDeviceKernels::available(DeviceKernel kernel) const
+{
+  return kernels_[(int)kernel].function != nullptr;
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP*/
--- a/intern/cycles/device/hip/kernel.h
+++ b/intern/cycles/device/hip/kernel.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+#  include "device/device_kernel.h"
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+/* HIP kernel and associate occupancy information. */
+class HIPDeviceKernel {
+ public:
+  hipFunction_t function = nullptr;
+
+  int num_threads_per_block = 0;
+  int min_blocks = 0;
+};
+
+/* Cache of HIP kernels for each DeviceKernel. */
+class HIPDeviceKernels {
+ public:
+  void load(HIPDevice *device);
+  const HIPDeviceKernel &get(DeviceKernel kernel) const;
+  bool available(DeviceKernel kernel) const;
+
+ protected:
+  HIPDeviceKernel kernels_[DEVICE_KERNEL_NUM];
+  bool loaded = false;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
--- a/intern/cycles/device/hip/queue.cpp
+++ b/intern/cycles/device/hip/queue.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/queue.h"
+
+#  include "device/hip/device_impl.h"
+#  include "device/hip/graphics_interop.h"
+#  include "device/hip/kernel.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* HIPDeviceQueue */
+
+HIPDeviceQueue::HIPDeviceQueue(HIPDevice *device)
+    : DeviceQueue(device), hip_device_(device), hip_stream_(nullptr)
+{
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(hip_device_, hipStreamCreateWithFlags(&hip_stream_, hipStreamNonBlocking));
+}
+
+HIPDeviceQueue::~HIPDeviceQueue()
+{
+  const HIPContextScope scope(hip_device_);
+  hipStreamDestroy(hip_stream_);
+}
+
+int HIPDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
+{
+  /* TODO: compute automatically. */
+  /* TODO: must have at least num_threads_per_block. */
+  return 14416128;
+}
+
+int HIPDeviceQueue::num_concurrent_busy_states() const
+{
+  const int max_num_threads = hip_device_->get_num_multiprocessors() *
+                              hip_device_->get_max_num_threads_per_multiprocessor();
+
+  if (max_num_threads == 0) {
+    return 65536;
+  }
+
+  return 4 * max_num_threads;
+}
+
+void HIPDeviceQueue::init_execution()
+{
+  /* Synchronize all textures and memory copies before executing task. */
+  HIPContextScope scope(hip_device_);
+  hip_device_->load_texture_info();
+  hip_device_assert(hip_device_, hipDeviceSynchronize());
+
+  debug_init_execution();
+}
+
+bool HIPDeviceQueue::kernel_available(DeviceKernel kernel) const
+{
+  return hip_device_->kernels.available(kernel);
+}
+
+bool HIPDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *args[])
+{
+  if (hip_device_->have_error()) {
+    return false;
+  }
+
+  debug_enqueue(kernel, work_size);
+
+  const HIPContextScope scope(hip_device_);
+  const HIPDeviceKernel &hip_kernel = hip_device_->kernels.get(kernel);
+
+  /* Compute kernel launch parameters. */
+  const int num_threads_per_block = hip_kernel.num_threads_per_block;
+  const int num_blocks = divide_up(work_size, num_threads_per_block);
+
+  int shared_mem_bytes = 0;
+
+  switch (kernel) {
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
+    case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
+      /* See parall_active_index.h for why this amount of shared memory is needed. */
+      shared_mem_bytes = (num_threads_per_block + 1) * sizeof(int);
+      break;
+    default:
+      break;
+  }
+
+  /* Launch kernel. */
+  hip_device_assert(hip_device_,
+                    hipModuleLaunchKernel(hip_kernel.function,
+                                          num_blocks,
+                                          1,
+                                          1,
+                                          num_threads_per_block,
+                                          1,
+                                          1,
+                                          shared_mem_bytes,
+                                          hip_stream_,
+                                          args,
+                                          0));
+  return !(hip_device_->have_error());
+}
+
+bool HIPDeviceQueue::synchronize()
+{
+  if (hip_device_->have_error()) {
+    return false;
+  }
+
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(hip_device_, hipStreamSynchronize(hip_stream_));
+  debug_synchronize();
+
+  return !(hip_device_->have_error());
+}
+
+void HIPDeviceQueue::zero_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    hip_device_->mem_alloc(mem);
+  }
+
+  /* Zero memory on device. */
+  assert(mem.device_pointer != 0);
+
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(
+      hip_device_,
+      hipMemsetD8Async((hipDeviceptr_t)mem.device_pointer, 0, mem.memory_size(), hip_stream_));
+}
+
+void HIPDeviceQueue::copy_to_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  /* Allocate on demand. */
+  if (mem.device_pointer == 0) {
+    hip_device_->mem_alloc(mem);
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory to device. */
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(
+      hip_device_,
+      hipMemcpyHtoDAsync(
+          (hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size(), hip_stream_));
+}
+
+void HIPDeviceQueue::copy_from_device(device_memory &mem)
+{
+  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
+
+  if (mem.memory_size() == 0) {
+    return;
+  }
+
+  assert(mem.device_pointer != 0);
+  assert(mem.host_pointer != nullptr);
+
+  /* Copy memory from device. */
+  const HIPContextScope scope(hip_device_);
+  hip_device_assert(
+      hip_device_,
+      hipMemcpyDtoHAsync(
+          mem.host_pointer, (hipDeviceptr_t)mem.device_pointer, mem.memory_size(), hip_stream_));
+}
+
+// TODO : (Arya) Enable this after stabilizing dev branch
+unique_ptr<DeviceGraphicsInterop> HIPDeviceQueue::graphics_interop_create()
+{
+  return make_unique<HIPDeviceGraphicsInterop>(this);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
--- a/intern/cycles/device/hip/queue.h
+++ b/intern/cycles/device/hip/queue.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+#  include "device/device_kernel.h"
+#  include "device/device_memory.h"
+#  include "device/device_queue.h"
+
+#  include "device/hip/util.h"
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+class device_memory;
+
+/* Base class for HIP queues. */
+class HIPDeviceQueue : public DeviceQueue {
+ public:
+  HIPDeviceQueue(HIPDevice *device);
+  ~HIPDeviceQueue();
+
+  virtual int num_concurrent_states(const size_t state_size) const override;
+  virtual int num_concurrent_busy_states() const override;
+
+  virtual void init_execution() override;
+
+  virtual bool kernel_available(DeviceKernel kernel) const override;
+
+  virtual bool enqueue(DeviceKernel kernel, const int work_size, void *args[]) override;
+
+  virtual bool synchronize() override;
+
+  virtual void zero_to_device(device_memory &mem) override;
+  virtual void copy_to_device(device_memory &mem) override;
+  virtual void copy_from_device(device_memory &mem) override;
+
+  virtual hipStream_t stream()
+  {
+    return hip_stream_;
+  }
+
+  // TODO : (Arya) Enable this after stabilizing the dev branch
+  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create() override;
+
+ protected:
+  HIPDevice *hip_device_;
+  hipStream_t hip_stream_;
+};
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
--- a/intern/cycles/device/hip/util.cpp
+++ b/intern/cycles/device/hip/util.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef WITH_HIP
+
+#  include "device/hip/util.h"
+#  include "device/hip/device_impl.h"
+
+CCL_NAMESPACE_BEGIN
+
+HIPContextScope::HIPContextScope(HIPDevice *device) : device(device)
+{
+  hip_device_assert(device, hipCtxPushCurrent(device->hipContext));
+}
+
+HIPContextScope::~HIPContextScope()
+{
+  hip_device_assert(device, hipCtxPopCurrent(NULL));
+}
+
+#  ifndef WITH_HIP_DYNLOAD
+const char *hipewErrorString(hipError_t result)
+{
+  /* We can only give error code here without major code duplication, that
+   * should be enough since dynamic loading is only being disabled by folks
+   * who knows what they're doing anyway.
+   *
+   * NOTE: Avoid call from several threads.
+   */
+  static string error;
+  error = string_printf("%d", result);
+  return error.c_str();
+}
+
+const char *hipewCompilerPath()
+{
+  return CYCLES_HIP_HIPCC_EXECUTABLE;
+}
+
+int hipewCompilerVersion()
+{
+  return (HIP_VERSION / 100) + (HIP_VERSION % 100 / 10);
+}
+#  endif
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
--- a/intern/cycles/device/hip/util.h
+++ b/intern/cycles/device/hip/util.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifdef WITH_HIP
+
+#  ifdef WITH_HIP_DYNLOAD
+#    include "hipew.h"
+#  endif
+
+CCL_NAMESPACE_BEGIN
+
+class HIPDevice;
+
+/* Utility to push/pop HIP context. */
+class HIPContextScope {
+ public:
+  HIPContextScope(HIPDevice *device);
+  ~HIPContextScope();
+
+ private:
+  HIPDevice *device;
+};
+
+/* Utility for checking return values of HIP function calls. */
+#  define hip_device_assert(hip_device, stmt) \
+    { \
+      hipError_t result = stmt; \
+      if (result != hipSuccess) { \
+        const char *name = hipewErrorString(result); \
+        hip_device->set_error( \
+            string_printf("%s in %s (%s:%d)", name, #stmt, __FILE__, __LINE__)); \
+      } \
+    } \
+    (void)0
+
+#  define hip_assert(stmt) hip_device_assert(this, stmt)
+
+#  ifndef WITH_HIP_DYNLOAD
+/* Transparently implement some functions, so majority of the file does not need
+ * to worry about difference between dynamically loaded and linked HIP at all. */
+const char *hipewErrorString(hipError_t result);
+const char *hipewCompilerPath();
+int hipewCompilerVersion();
+#  endif /* WITH_HIP_DYNLOAD */
+
+CCL_NAMESPACE_END
+
+#endif /* WITH_HIP */
--- a/intern/cycles/device/multi/device.cpp
+++ b/intern/cycles/device/multi/device.cpp
@@ -315,14 +315,14 @@ class MultiDevice : public Device {
    stats.mem_alloc(mem.device_size - existing_size);
  }

-  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem) override
+  void mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem) override
  {
    device_ptr key = mem.device_pointer;
-    int i = 0, sub_h = h / devices.size();
+    size_t i = 0, sub_h = h / devices.size();

    foreach (SubDevice &sub, devices) {
-      int sy = y + i * sub_h;
-      int sh = (i == (int)devices.size() - 1) ? h - sub_h * i : sub_h;
+      size_t sy = y + i * sub_h;
+      size_t sh = (i == (size_t)devices.size() - 1) ? h - sub_h * i : sub_h;

      SubDevice *owner_sub = find_matching_mem_device(key, sub);
      mem.device = owner_sub->device;
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -315,6 +315,11 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
  group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
  group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
  group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
+  group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
+  group_descs[PG_HITV].hitgroup.moduleCH = optix_module;
+  group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
+  group_descs[PG_HITV].hitgroup.moduleAH = optix_module;
+  group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test";

  if (kernel_features & KERNEL_FEATURE_HAIR) {
    if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
@@ -397,6 +402,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
  trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
  trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
+  trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH);
  trace_css = std::max(trace_css,
                       stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
  trace_css = std::max(trace_css,
@@ -421,6 +427,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
    pipeline_groups.push_back(groups[PG_HITD]);
    pipeline_groups.push_back(groups[PG_HITS]);
    pipeline_groups.push_back(groups[PG_HITL]);
+    pipeline_groups.push_back(groups[PG_HITV]);
    if (motion_blur) {
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
@@ -459,6 +466,7 @@ bool OptiXDevice::load_kernels(const uint kernel_features)
    pipeline_groups.push_back(groups[PG_HITD]);
    pipeline_groups.push_back(groups[PG_HITS]);
    pipeline_groups.push_back(groups[PG_HITL]);
+    pipeline_groups.push_back(groups[PG_HITV]);
    if (motion_blur) {
      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
@@ -1390,25 +1398,33 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
      /* Set user instance ID to object index (but leave low bit blank). */
      instance.instanceId = ob->get_device_index() << 1;

-      /* Have to have at least one bit in the mask, or else instance would always be culled. */
-      instance.visibilityMask = 1;
+      /* Add some of the object visibility bits to the mask.
+       * __prim_visibility contains the combined visibility bits of all instances, so is not
+       * reliable if they differ between instances. But the OptiX visibility mask can only contain
+       * 8 bits, so have to trade-off here and select just a few important ones.
+       */
+      instance.visibilityMask = ob->visibility_for_tracing() & 0xFF;

-      if (ob->get_geometry()->has_volume) {
-        /* Volumes have a special bit set in the visibility mask so a trace can mask only volumes.
-         */
-        instance.visibilityMask |= 2;
+      /* Have to have at least one bit in the mask, or else instance would always be culled. */
+      if (0 == instance.visibilityMask) {
+        instance.visibilityMask = 0xFF;
      }

-      if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
-        /* Same applies to curves (so they can be skipped in local trace calls). */
-        instance.visibilityMask |= 4;
-
-        if (motion_blur && ob->get_geometry()->has_motion_blur() &&
-            static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+      if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
+          static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
+        if (motion_blur && ob->get_geometry()->has_motion_blur()) {
          /* Select between motion blur and non-motion blur built-in intersection module. */
          instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
        }
      }
+      else {
+        /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
+         * since it needs to filter out endcaps there).
+         * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
+         * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
+         */
+        instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
+      }

      /* Insert motion traversable if object has motion. */
      if (motion_blur && ob->use_motion()) {
@@ -1474,7 +1490,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
        delete[] reinterpret_cast<uint8_t *>(&motion_transform);

        /* Disable instance transform if object uses motion transform already. */
-        instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+        instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;

        /* Get traversable handle to motion transform. */
        optixConvertPointerToTraversableHandle(context,
@@ -1491,7 +1507,7 @@ void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
        }
        else {
          /* Disable instance transform if geometry already has it applied to vertex data. */
-          instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
+          instance.flags |= OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
          /* Non-instanced objects read ID from 'prim_object', so distinguish
           * them from instanced objects with the low bit set. */
          instance.instanceId |= 1;
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -40,6 +40,7 @@ enum {
  PG_HITD, /* Default hit group. */
  PG_HITS, /* __SHADOW_RECORD_ALL__ hit group. */
  PG_HITL, /* __BVH_LOCAL__ hit group (only used for triangles). */
+  PG_HITV, /* __VOLUME__ hit group. */
  PG_HITD_MOTION,
  PG_HITS_MOTION,
  PG_CALL_SVM_AO,
@@ -51,7 +52,7 @@ enum {
 static const int MISS_PROGRAM_GROUP_OFFSET = PG_MISS;
 static const int NUM_MIS_PROGRAM_GROUPS = 1;
 static const int HIT_PROGAM_GROUP_OFFSET = PG_HITD;
-static const int NUM_HIT_PROGRAM_GROUPS = 5;
+static const int NUM_HIT_PROGRAM_GROUPS = 6;
 static const int CALLABLE_PROGRAM_GROUPS_BASE = PG_CALL_SVM_AO;
 static const int NUM_CALLABLE_PROGRAM_GROUPS = 3;

--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -244,7 +244,7 @@ static void foreach_sliced_buffer_params(const vector<unique_ptr<PathTraceWork>>
    const int slice_height = max(lround(height * weight), 1);

    /* Disallow negative values to deal with situations when there are more compute devices than
-     * scanlines. */
+     * scan-lines. */
    const int remaining_height = max(0, height - current_y);

    BufferParams slide_params = buffer_params;
@@ -801,7 +801,7 @@ void PathTrace::tile_buffer_write_to_disk()
  }

  if (!tile_manager_.write_tile(*buffers)) {
-    LOG(ERROR) << "Error writing tile to file.";
+    device_->set_error("Error writing tile to file");
  }
 }

@@ -894,7 +894,14 @@ void PathTrace::process_full_buffer_from_disk(string_view filename)

  DenoiseParams denoise_params;
  if (!tile_manager_.read_full_buffer_from_disk(filename, &full_frame_buffers, &denoise_params)) {
-    LOG(ERROR) << "Error reading tiles from file.";
+    const string error_message = "Error reading tiles from file";
+    if (progress_) {
+      progress_->set_error(error_message);
+      progress_->set_cancel(error_message);
+    }
+    else {
+      LOG(ERROR) << error_message;
+    }
    return;
  }

@@ -1028,6 +1035,8 @@ static const char *device_type_for_description(const DeviceType type)
      return "CUDA";
    case DEVICE_OPTIX:
      return "OptiX";
+    case DEVICE_HIP:
+      return "HIP";
    case DEVICE_DUMMY:
      return "Dummy";
    case DEVICE_MULTI:
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -286,7 +286,7 @@ class PathTrace {
    /* Parameters of the big tile with the current resolution divider applied. */
    BufferParams effective_big_tile_params;

-    /* Denosier was run and there are denoised versions of the passes in the render buffers. */
+    /* Denoiser was run and there are denoised versions of the passes in the render buffers. */
    bool has_denoised_result = false;

    /* Current tile has been written (to either disk or callback.
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -104,7 +104,7 @@ class PathTraceWork {
   * - Copies work's render buffer to its device. */
  void copy_from_render_buffers(const RenderBuffers *render_buffers);

-  /* Special version of the `copy_from_render_buffers()` which only copies denosied passes from the
+  /* Special version of the `copy_from_render_buffers()` which only copies denoised passes from the
   * given render buffers, leaving rest of the passes.
   *
   * Same notes about device copying applies to this call as well. */
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -19,6 +19,8 @@
 #include "device/cpu/kernel.h"
 #include "device/device.h"

+#include "kernel/kernel_path_state.h"
+
 #include "integrator/pass_accessor_cpu.h"

 #include "render/buffers.h"
@@ -116,13 +118,17 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global
                                                    const KernelWorkTile &work_tile,
                                                    const int samples_num)
 {
-  const bool has_shadow_catcher = device_scene_->data.integrator.has_shadow_catcher;
  const bool has_bake = device_scene_->data.bake.use;

-  IntegratorStateCPU integrator_states[2] = {};
+  IntegratorStateCPU integrator_states[2];

  IntegratorStateCPU *state = &integrator_states[0];
-  IntegratorStateCPU *shadow_catcher_state = &integrator_states[1];
+  IntegratorStateCPU *shadow_catcher_state = nullptr;
+
+  if (device_scene_->data.integrator.has_shadow_catcher) {
+    shadow_catcher_state = &integrator_states[1];
+    path_state_init_queues(kernel_globals, shadow_catcher_state);
+  }

  KernelWorkTile sample_work_tile = work_tile;
  float *render_buffer = buffers_->buffer.data();
@@ -147,7 +153,7 @@ void PathTraceWorkCPU::render_samples_full_pipeline(KernelGlobals *kernel_global

    kernels_.integrator_megakernel(kernel_globals, state, render_buffer);

-    if (has_shadow_catcher) {
+    if (shadow_catcher_state) {
      kernels_.integrator_megakernel(kernel_globals, shadow_catcher_state, render_buffer);
    }

--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -95,8 +95,8 @@ void PathTraceWorkGPU::alloc_integrator_soa()
 #define KERNEL_STRUCT_END(name) \
  break; \
  }
-#define KERNEL_STRUCT_END_ARRAY(name, array_size) \
-  if (array_index == array_size - 1) { \
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
+  if (array_index == gpu_array_size - 1) { \
    break; \
  } \
  }
@@ -738,7 +738,8 @@ void PathTraceWorkGPU::copy_to_gpu_display_naive(GPUDisplay *gpu_display,

  get_render_tile_film_pixels(destination, pass_mode, num_samples);

-  gpu_display_rgba_half_.copy_from_device();
+  queue_->copy_from_device(gpu_display_rgba_half_);
+  queue_->synchronize();

  gpu_display->copy_pixels_to_texture(
      gpu_display_rgba_half_.data(), texture_x, texture_y, width, height);
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -384,7 +384,7 @@ bool RenderScheduler::set_postprocess_render_work(RenderWork *render_work)
  }

  if (denoiser_params_.use && !state_.last_work_tile_was_denoised) {
-    render_work->tile.denoise = true;
+    render_work->tile.denoise = !tile_manager_.has_multiple_tiles();
    any_scheduled = true;
  }

@@ -903,6 +903,12 @@ bool RenderScheduler::work_need_denoise(bool &delayed, bool &ready_to_display)
    return false;
  }

+  /* When multiple tiles are used the full frame will be denoised.
+   * Avoid per-tile denoising to save up render time. */
+  if (tile_manager_.has_multiple_tiles()) {
+    return false;
+  }
+
  if (done()) {
    /* Always denoise at the last sample. */
    return true;
--- a/intern/cycles/integrator/render_scheduler.h
+++ b/intern/cycles/integrator/render_scheduler.h
@@ -31,7 +31,7 @@ class RenderWork {
  int resolution_divider = 1;

  /* Initialize render buffers.
-   * Includes steps like zero-ing the buffer on the device, and optional reading of pixels from the
+   * Includes steps like zeroing the buffer on the device, and optional reading of pixels from the
   * baking target. */
  bool init_render_buffers = false;

--- a/intern/cycles/integrator/shader_eval.cpp
+++ b/intern/cycles/integrator/shader_eval.cpp
@@ -149,14 +149,14 @@ bool ShaderEval::eval_gpu(Device *device,

  /* Execute work on GPU in chunk, so we can cancel.
   * TODO : query appropriate size from device.*/
-  const int chunk_size = 65536;
+  const int64_t chunk_size = 65536;

-  const int work_size = output.size();
+  const int64_t work_size = output.size();
  void *d_input = (void *)input.device_pointer;
  void *d_output = (void *)output.device_pointer;

-  for (int d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
-    int d_work_size = min(chunk_size, work_size - d_offset);
+  for (int64_t d_offset = 0; d_offset < work_size; d_offset += chunk_size) {
+    int64_t d_work_size = std::min(chunk_size, work_size - d_offset);
    void *args[] = {&d_input, &d_output, &d_offset, &d_work_size};

    queue->enqueue(kernel, d_work_size, args);
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -35,6 +35,10 @@ set(SRC_DEVICE_CUDA
  device/cuda/kernel.cu
 )

+set(SRC_DEVICE_HIP
+  device/hip/kernel.cpp
+)
+
 set(SRC_DEVICE_OPTIX
  device/optix/kernel.cu
  device/optix/kernel_shader_raytrace.cu
@@ -106,6 +110,12 @@ set(SRC_DEVICE_CUDA_HEADERS
  device/cuda/globals.h
 )

+set(SRC_DEVICE_HIP_HEADERS
+  device/hip/compat.h
+  device/hip/config.h
+  device/hip/globals.h
+)
+
 set(SRC_DEVICE_OPTIX_HEADERS
  device/optix/compat.h
  device/optix/globals.h
@@ -458,6 +468,104 @@ if(WITH_CYCLES_CUDA_BINARIES)
  cycles_set_solution_folder(cycles_kernel_cuda)
 endif()

+####################################################### START
+
+# HIP module
+
+if(WITH_CYCLES_HIP_BINARIES)
+  # 64 bit only
+  set(HIP_BITS 64)
+
+  # HIP version
+  execute_process(COMMAND ${HIP_HIPCC_EXECUTABLE} "--version" OUTPUT_VARIABLE HIPCC_OUT)
+  string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\1" HIP_VERSION_MAJOR "${HIPCC_OUT}")
+  string(REGEX REPLACE ".*release ([0-9]+)\\.([0-9]+).*" "\\2" HIP_VERSION_MINOR "${HIPCC_OUT}")
+  set(HIP_VERSION "${HIP_VERSION_MAJOR}${HIP_VERSION_MINOR}")
+
+
+  message(WARNING
+    "HIP version ${HIP_VERSION_MAJOR}.${HIP_VERSION_MINOR} detected")
+
+  # build for each arch
+  set(hip_sources device/hip/kernel.cpp
+    ${SRC_HEADERS}
+    ${SRC_DEVICE_HIP_HEADERS}
+    ${SRC_BVH_HEADERS}
+    ${SRC_SVM_HEADERS}
+    ${SRC_GEOM_HEADERS}
+    ${SRC_INTEGRATOR_HEADERS}
+    ${SRC_CLOSURE_HEADERS}
+    ${SRC_UTIL_HEADERS}
+  )
+  set(hip_fatbins)
+
+  macro(CYCLES_HIP_KERNEL_ADD arch prev_arch name flags sources experimental)
+    if(${arch} MATCHES "compute_.*")
+      set(format "ptx")
+    else()
+      set(format "fatbin")
+    endif()
+    set(hip_file ${name}_${arch}.${format})
+
+    set(kernel_sources ${sources})
+    if(NOT ${prev_arch} STREQUAL "none")
+      if(${prev_arch} MATCHES "compute_.*")
+        set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.ptx)
+      else()
+        set(kernel_sources ${kernel_sources} ${name}_${prev_arch}.fatbin)
+      endif()
+    endif()
+
+    set(hip_kernel_src "/device/hip/${name}.cpp")
+
+    set(hip_flags ${flags}
+      -D CCL_NAMESPACE_BEGIN=
+      -D CCL_NAMESPACE_END=
+      -D HIPCC
+      -m ${HIP_BITS}
+      -I ${CMAKE_CURRENT_SOURCE_DIR}/..
+      -I ${CMAKE_CURRENT_SOURCE_DIR}/device/hip
+      --use_fast_math
+      -o ${CMAKE_CURRENT_BINARY_DIR}/${hip_file})
+
+    if(${experimental})
+      set(hip_flags ${hip_flags} -D __KERNEL_EXPERIMENTAL__)
+      set(name ${name}_experimental)
+    endif()
+
+    if(WITH_CYCLES_DEBUG)
+      set(hip_flags ${hip_flags} -D __KERNEL_DEBUG__)
+    endif()
+
+    if(WITH_NANOVDB)
+      set(hip_flags ${hip_flags}
+        -D WITH_NANOVDB
+        -I "${NANOVDB_INCLUDE_DIR}")
+    endif()
+  endmacro()
+
+  set(prev_arch "none")
+  foreach(arch ${CYCLES_HIP_BINARIES_ARCH})
+      set(hip_hipcc_executable ${HIP_HIPCC_EXECUTABLE})
+      set(hip_toolkit_root_dir ${HIP_TOOLKIT_ROOT_DIR})
+    if(DEFINED hip_hipcc_executable AND DEFINED hip_toolkit_root_dir)
+      # Compile regular kernel
+      CYCLES_HIP_KERNEL_ADD(${arch} ${prev_arch} kernel "" "${hip_sources}" FALSE)
+
+      if(WITH_CYCLES_HIP_BUILD_SERIAL)
+        set(prev_arch ${arch})
+      endif()
+
+      unset(hip_hipcc_executable)
+      unset(hip_toolkit_root_dir)
+    endif()
+  endforeach()
+
+  add_custom_target(cycles_kernel_hip ALL DEPENDS ${hip_fatbins})
+  cycles_set_solution_folder(cycles_kernel_hip)
+endif()
+
+####################################################### END
 # OptiX PTX modules

 if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
@@ -602,11 +710,13 @@ endif()
 cycles_add_library(cycles_kernel "${LIB}"
  ${SRC_DEVICE_CPU}
  ${SRC_DEVICE_CUDA}
+  ${SRC_DEVICE_HIP}
  ${SRC_DEVICE_OPTIX}
  ${SRC_HEADERS}
  ${SRC_DEVICE_CPU_HEADERS}
  ${SRC_DEVICE_GPU_HEADERS}
  ${SRC_DEVICE_CUDA_HEADERS}
+  ${SRC_DEVICE_HIP_HEADERS}
  ${SRC_DEVICE_OPTIX_HEADERS}
  ${SRC_BVH_HEADERS}
  ${SRC_CLOSURE_HEADERS}
@@ -621,6 +731,7 @@ source_group("geom" FILES ${SRC_GEOM_HEADERS})
 source_group("integrator" FILES ${SRC_INTEGRATOR_HEADERS})
 source_group("kernel" FILES ${SRC_HEADERS})
 source_group("device\\cpu" FILES ${SRC_DEVICE_CPU} ${SRC_DEVICE_CPU_HEADERS})
+source_group("device\\hip" FILES ${SRC_DEVICE_HIP} ${SRC_DEVICE_HIP_HEADERS})
 source_group("device\\gpu" FILES ${SRC_DEVICE_GPU_HEADERS})
 source_group("device\\cuda" FILES ${SRC_DEVICE_CUDA} ${SRC_DEVICE_CUDA_HEADERS})
 source_group("device\\optix" FILES ${SRC_DEVICE_OPTIX} ${SRC_DEVICE_OPTIX_HEADERS})
@@ -632,14 +743,19 @@ endif()
 if(WITH_CYCLES_DEVICE_OPTIX AND WITH_CYCLES_CUDA_BINARIES)
  add_dependencies(cycles_kernel cycles_kernel_optix)
 endif()
+if(WITH_CYCLES_HIP)
+  add_dependencies(cycles_kernel cycles_kernel_hip)
+endif()

 # Install kernel source for runtime compilation

 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_GPU_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/gpu)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_CUDA_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/cuda)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_HIP_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/hip)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_DEVICE_OPTIX_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/device/optix)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_BVH_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/bvh)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/source/kernel/closure)
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -167,15 +167,25 @@ ccl_device_intersect bool scene_intersect(const KernelGlobals *kg,
  uint p4 = visibility;
  uint p5 = PRIMITIVE_NONE;

+  uint ray_mask = visibility & 0xFF;
+  uint ray_flags = OPTIX_RAY_FLAG_NONE;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+    ray_flags = OPTIX_RAY_FLAG_ENFORCE_ANYHIT;
+  }
+  else if (visibility & PATH_RAY_SHADOW_OPAQUE) {
+    ray_flags = OPTIX_RAY_FLAG_TERMINATE_ON_FIRST_HIT;
+  }
+
  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
             ray->P,
             ray->D,
             0.0f,
             ray->t,
             ray->time,
-             0xF,
-             OPTIX_RAY_FLAG_NONE,
-             0,  // SBT offset for PG_HITD
+             ray_mask,
+             ray_flags,
+             0, /* SBT offset for PG_HITD */
             0,
             0,
             p0,
@@ -251,11 +261,11 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
  uint p2 = ((uint64_t)local_isect) & 0xFFFFFFFF;
  uint p3 = (((uint64_t)local_isect) >> 32) & 0xFFFFFFFF;
  uint p4 = local_object;
-  // Is set to zero on miss or if ray is aborted, so can be used as return value
+  /* Is set to zero on miss or if ray is aborted, so can be used as return value. */
  uint p5 = max_hits;

  if (local_isect) {
-    local_isect->num_hits = 0;  // Initialize hit count to zero
+    local_isect->num_hits = 0; /* Initialize hit count to zero. */
  }
  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
             ray->P,
@@ -263,11 +273,10 @@ ccl_device_intersect bool scene_intersect_local(const KernelGlobals *kg,
             0.0f,
             ray->t,
             ray->time,
-             // Skip curves
-             0x3,
-             // Need to always call into __anyhit__kernel_optix_local_hit
+             0xFF,
+             /* Need to always call into __anyhit__kernel_optix_local_hit. */
             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             2,  // SBT offset for PG_HITL
+             2, /* SBT offset for PG_HITL */
             0,
             0,
             p0,
@@ -365,17 +374,22 @@ ccl_device_intersect bool scene_intersect_shadow_all(const KernelGlobals *kg,
  uint p4 = visibility;
  uint p5 = false;

-  *num_hits = 0;  // Initialize hit count to zero
+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
+  *num_hits = 0; /* Initialize hit count to zero. */
  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
             ray->P,
             ray->D,
             0.0f,
             ray->t,
             ray->time,
-             0xF,
-             // Need to always call into __anyhit__kernel_optix_shadow_all_hit
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_shadow_all_hit. */
             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
-             1,  // SBT offset for PG_HITS
+             1, /* SBT offset for PG_HITS */
             0,
             0,
             p0,
@@ -444,16 +458,21 @@ ccl_device_intersect bool scene_intersect_volume(const KernelGlobals *kg,
  uint p4 = visibility;
  uint p5 = PRIMITIVE_NONE;

+  uint ray_mask = visibility & 0xFF;
+  if (0 == ray_mask && (visibility & ~0xFF) != 0) {
+    ray_mask = 0xFF;
+  }
+
  optixTrace(scene_intersect_valid(ray) ? kernel_data.bvh.scene : 0,
             ray->P,
             ray->D,
             0.0f,
             ray->t,
             ray->time,
-             // Skip everything but volumes
-             0x2,
-             OPTIX_RAY_FLAG_NONE,
-             0,  // SBT offset for PG_HITD
+             ray_mask,
+             /* Need to always call into __anyhit__kernel_optix_volume_test. */
+             OPTIX_RAY_FLAG_ENFORCE_ANYHIT,
+             3, /* SBT offset for PG_HITV */
             0,
             0,
             p0,
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -21,11 +21,15 @@ CCL_NAMESPACE_BEGIN
 /* Given an array of states, build an array of indices for which the states
 * are active.
 *
- * Shared memory requirement is sizeof(int) * (number_of_warps + 1) */
+ * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */

 #include "util/util_atomic.h"

-#define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif

 template<uint blocksize, typename IsActiveOp>
 __device__ void gpu_parallel_active_index_array(const uint num_states,
--- a/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
+++ b/intern/cycles/kernel/device/gpu/parallel_prefix_sum.h
@@ -27,7 +27,11 @@ CCL_NAMESPACE_BEGIN

 #include "util/util_atomic.h"

-#define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_PREFIX_SUM_DEFAULT_BLOCK_SIZE 512
+#endif

 template<uint blocksize> __device__ void gpu_parallel_prefix_sum(int *values, const int num_values)
 {
--- a/intern/cycles/kernel/device/gpu/parallel_reduce.h
+++ b/intern/cycles/kernel/device/gpu/parallel_reduce.h
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN
 * the overall cost of the algorithm while keeping the work complexity O(n) and
 * the step complexity O(log n). (Brent's Theorem optimization) */

-#define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_SUM_DEFAULT_BLOCK_SIZE 512
+#endif

 template<uint blocksize, typename InputT, typename OutputT, typename ConvertOp>
 __device__ void gpu_parallel_sum(
--- a/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_sorted_index.h
@@ -26,7 +26,11 @@ CCL_NAMESPACE_BEGIN

 #include "util/util_atomic.h"

-#define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#ifdef __HIP__
+#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 1024
+#else
+#  define GPU_PARALLEL_SORTED_INDEX_DEFAULT_BLOCK_SIZE 512
+#endif
 #define GPU_PARALLEL_SORTED_INDEX_INACTIVE_KEY (~0)

 template<uint blocksize, typename GetKeyOp>
--- a/intern/cycles/kernel/device/hip/compat.h
+++ b/intern/cycles/kernel/device/hip/compat.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define __KERNEL_GPU__
+#define __KERNEL_HIP__
+#define CCL_NAMESPACE_BEGIN
+#define CCL_NAMESPACE_END
+
+#ifndef ATTR_FALLTHROUGH
+#  define ATTR_FALLTHROUGH
+#endif
+
+#ifdef __HIPCC_RTC__
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+#else
+#  include <stdint.h>
+#endif
+
+#ifdef CYCLES_HIPBIN_CC
+#  define FLT_MIN 1.175494350822287507969e-38f
+#  define FLT_MAX 340282346638528859811704183484516925440.0f
+#  define FLT_EPSILON 1.192092896e-07F
+#endif
+
+/* Qualifiers */
+
+#define ccl_device __device__ __inline__
+#define ccl_device_inline __device__ __inline__
+#define ccl_device_forceinline __device__ __forceinline__
+#define ccl_device_noinline __device__ __noinline__
+#define ccl_device_noinline_cpu ccl_device
+#define ccl_global
+#define ccl_static_constant __constant__
+#define ccl_device_constant __constant__ __device__
+#define ccl_constant const
+#define ccl_gpu_shared __shared__
+#define ccl_private
+#define ccl_may_alias
+#define ccl_addr_space
+#define ccl_restrict __restrict__
+#define ccl_loop_no_unroll
+#define ccl_align(n) __align__(n)
+#define ccl_optional_struct_init
+
+#define kernel_assert(cond)
+
+/* Types */
+#ifdef __HIP__
+#  include "hip/hip_fp16.h"
+#  include "hip/hip_runtime.h"
+#endif
+
+#ifdef _MSC_VER
+#  include <immintrin.h>
+#endif
+
+#define ccl_gpu_thread_idx_x (threadIdx.x)
+#define ccl_gpu_block_dim_x (blockDim.x)
+#define ccl_gpu_block_idx_x (blockIdx.x)
+#define ccl_gpu_grid_dim_x (gridDim.x)
+#define ccl_gpu_warp_size (warpSize)
+
+#define ccl_gpu_global_id_x() (ccl_gpu_block_idx_x * ccl_gpu_block_dim_x + ccl_gpu_thread_idx_x)
+#define ccl_gpu_global_size_x() (ccl_gpu_grid_dim_x * ccl_gpu_block_dim_x)
+
+/* GPU warp synchronization */
+
+#define ccl_gpu_syncthreads() __syncthreads()
+#define ccl_gpu_ballot(predicate) __ballot(predicate)
+#define ccl_gpu_shfl_down_sync(mask, var, detla) __shfl_down(var, detla)
+#define ccl_gpu_popc(x) __popc(x)
+
+/* GPU texture objects */
+typedef hipTextureObject_t ccl_gpu_tex_object;
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_2D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y)
+{
+  return tex2D<T>(texobj, x, y);
+}
+
+template<typename T>
+ccl_device_forceinline T ccl_gpu_tex_object_read_3D(const ccl_gpu_tex_object texobj,
+                                                    const float x,
+                                                    const float y,
+                                                    const float z)
+{
+  return tex3D<T>(texobj, x, y, z);
+}
+
+/* Use fast math functions */
+
+#define cosf(x) __cosf(((float)(x)))
+#define sinf(x) __sinf(((float)(x)))
+#define powf(x, y) __powf(((float)(x)), ((float)(y)))
+#define tanf(x) __tanf(((float)(x)))
+#define logf(x) __logf(((float)(x)))
+#define expf(x) __expf(((float)(x)))
+
+/* Types */
+
+#include "util/util_half.h"
+#include "util/util_types.h"
--- a/intern/cycles/kernel/device/hip/config.h
+++ b/intern/cycles/kernel/device/hip/config.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Device data taken from HIP occupancy calculator.
+ *
+ * Terminology
+ * - HIP GPUs have multiple streaming multiprocessors
+ * - Each multiprocessor executes multiple thread blocks
+ * - Each thread block contains a number of threads, also known as the block size
+ * - Multiprocessors have a fixed number of registers, and the amount of registers
+ *   used by each threads limits the number of threads per block.
+ */
+
+/* Launch Bound Definitions */
+#define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
+#define GPU_MULTIPROCESSOR_MAX_BLOCKS 64
+#define GPU_BLOCK_MAX_THREADS 1024
+#define GPU_THREAD_MAX_REGISTERS 255
+
+#define GPU_KERNEL_BLOCK_NUM_THREADS 1024
+#define GPU_KERNEL_MAX_REGISTERS 64
+
+/* Compute number of threads per block and minimum blocks per multiprocessor
+ * given the maximum number of registers per thread. */
+
+#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
+  extern "C" __global__ void __launch_bounds__(block_num_threads, \
+                                               GPU_MULTIPRESSOR_MAX_REGISTERS / \
+                                                   (block_num_threads * thread_num_registers))
+
+/* sanity checks */
+
+#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
+#  error "Maximum number of threads per block exceeded"
+#endif
+
+#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
+    GPU_MULTIPROCESSOR_MAX_BLOCKS
+#  error "Maximum number of blocks per multiprocessor exceeded"
+#endif
+
+#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
+#  error "Maximum number of registers per thread exceeded"
+#endif
--- a/intern/cycles/kernel/device/hip/globals.h
+++ b/intern/cycles/kernel/device/hip/globals.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Constant Globals */
+
+#pragma once
+
+#include "kernel/kernel_profiling.h"
+#include "kernel/kernel_types.h"
+
+#include "kernel/integrator/integrator_state.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Not actually used, just a NULL pointer that gets passed everywhere, which we
+ * hope gets optimized out by the compiler. */
+struct KernelGlobals {
+  /* NOTE: Keep the size in sync with SHADOW_STACK_MAX_HITS. */
+  int unused[1];
+};
+
+/* Global scene data and textures */
+__constant__ KernelData __data;
+#define KERNEL_TEX(type, name) __attribute__((used)) const __constant__ __device__ type *name;
+#include "kernel/kernel_textures.h"
+
+/* Integrator state */
+__constant__ IntegratorStateGPU __integrator_state;
+
+/* Abstraction macros */
+#define kernel_data __data
+#define kernel_tex_fetch(t, index) t[(index)]
+#define kernel_tex_array(t) (t)
+#define kernel_integrator_state __integrator_state
+
+CCL_NAMESPACE_END
--- a/intern/cycles/kernel/device/hip/kernel.cpp
+++ b/intern/cycles/kernel/device/hip/kernel.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2011-2021 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* HIP kernel entry points */
+
+#ifdef __HIP_DEVICE_COMPILE__
+
+#  include "kernel/device/hip/compat.h"
+#  include "kernel/device/hip/config.h"
+#  include "kernel/device/hip/globals.h"
+
+#  include "kernel/device/gpu/image.h"
+#  include "kernel/device/gpu/kernel.h"
+
+#endif
--- a/intern/cycles/kernel/device/optix/kernel.cu
+++ b/intern/cycles/kernel/device/optix/kernel.cu
@@ -19,7 +19,7 @@
 #include "kernel/device/optix/compat.h"
 #include "kernel/device/optix/globals.h"

-#include "kernel/device/gpu/image.h"  // Texture lookup uses normal CUDA intrinsics
+#include "kernel/device/gpu/image.h"  /* Texture lookup uses normal CUDA intrinsics. */

 #include "kernel/integrator/integrator_state.h"
 #include "kernel/integrator/integrator_state_flow.h"
@@ -44,18 +44,18 @@ template<typename T> ccl_device_forceinline T *get_payload_ptr_2()
 template<bool always = false> ccl_device_forceinline uint get_object_id()
 {
 #ifdef __OBJECT_MOTION__
-  // Always get the the instance ID from the TLAS
-  // There might be a motion transform node between TLAS and BLAS which does not have one
+  /* Always get the the instance ID from the TLAS.
+   * There might be a motion transform node between TLAS and BLAS which does not have one. */
  uint object = optixGetInstanceIdFromHandle(optixGetTransformListHandle(0));
 #else
  uint object = optixGetInstanceId();
 #endif
-  // Choose between always returning object ID or only for instances
+  /* Choose between always returning object ID or only for instances. */
  if (always || (object & 1) == 0)
-    // Can just remove the low bit since instance always contains object ID
+    /* Can just remove the low bit since instance always contains object ID. */
    return object >> 1;
  else
-    // Set to OBJECT_NONE if this is not an instanced object
+    /* Set to OBJECT_NONE if this is not an instanced object. */
    return OBJECT_NONE;
 }

@@ -93,23 +93,30 @@ extern "C" __global__ void __raygen__kernel_optix_integrator_intersect_volume_st

 extern "C" __global__ void __miss__kernel_optix_miss()
 {
-  // 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss
+  /* 'kernel_path_lamp_emission' checks intersection distance, so need to set it even on a miss. */
  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));
  optixSetPayload_5(PRIMITIVE_NONE);
 }

 extern "C" __global__ void __anyhit__kernel_optix_local_hit()
 {
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
 #ifdef __BVH_LOCAL__
  const uint object = get_object_id<true>();
  if (object != optixGetPayload_4() /* local_object */) {
-    // Only intersect with matching object
+    /* Only intersect with matching object. */
    return optixIgnoreIntersection();
  }

  const uint max_hits = optixGetPayload_5();
  if (max_hits == 0) {
-    // Special case for when no hit information is requested, just report that something was hit
+    /* Special case for when no hit information is requested, just report that something was hit */
    optixSetPayload_5(true);
    return optixTerminateRay();
  }
@@ -136,8 +143,9 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
  }
  else {
    if (local_isect->num_hits && optixGetRayTmax() > local_isect->hits[0].t) {
-      // Record closest intersection only
-      // Do not terminate ray here, since there is no guarantee about distance ordering in any-hit
+      /* Record closest intersection only.
+       * Do not terminate ray here, since there is no guarantee about distance ordering in any-hit.
+       */
      return optixIgnoreIntersection();
    }

@@ -154,14 +162,14 @@ extern "C" __global__ void __anyhit__kernel_optix_local_hit()
  isect->u = 1.0f - barycentrics.y - barycentrics.x;
  isect->v = barycentrics.x;

-  // Record geometric normal
+  /* Record geometric normal. */
  const uint tri_vindex = kernel_tex_fetch(__prim_tri_index, isect->prim);
  const float3 tri_a = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 0));
  const float3 tri_b = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 1));
  const float3 tri_c = float4_to_float3(kernel_tex_fetch(__prim_tri_verts, tri_vindex + 2));
  local_isect->Ng[hit] = normalize(cross(tri_b - tri_a, tri_c - tri_a));

-  // Continue tracing (without this the trace call would return after the first hit)
+  /* Continue tracing (without this the trace call would return after the first hit). */
  optixIgnoreIntersection();
 #endif
 }
@@ -190,7 +198,7 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
    u = __uint_as_float(optixGetAttribute_0());
    v = __uint_as_float(optixGetAttribute_1());

-    // Filter out curve endcaps
+    /* Filter out curve endcaps. */
    if (u == 0.0f || u == 1.0f) {
      ignore_intersection = true;
    }
@@ -241,10 +249,10 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
    isect->type = kernel_tex_fetch(__prim_type, prim);

 #  ifdef __TRANSPARENT_SHADOWS__
-    // Detect if this surface has a shader with transparent shadows
+    /* Detect if this surface has a shader with transparent shadows. */
    if (!shader_transparent_shadow(NULL, isect) || max_hits == 0) {
 #  endif
-      // If no transparent shadows, all light is blocked and we can stop immediately
+      /* If no transparent shadows, all light is blocked and we can stop immediately. */
      optixSetPayload_5(true);
      return optixTerminateRay();
 #  ifdef __TRANSPARENT_SHADOWS__
@@ -252,24 +260,39 @@ extern "C" __global__ void __anyhit__kernel_optix_shadow_all_hit()
 #  endif
  }

-  // Continue tracing
+  /* Continue tracing. */
  optixIgnoreIntersection();
 #endif
 }

-extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+extern "C" __global__ void __anyhit__kernel_optix_volume_test()
 {
-  uint visibility = optixGetPayload_4();
+#ifdef __HAIR__
+  if (!optixIsTriangleHit()) {
+    /* Ignore curves. */
+    return optixIgnoreIntersection();
+  }
+#endif
+
 #ifdef __VISIBILITY_FLAG__
  const uint prim = optixGetPrimitiveIndex();
+  const uint visibility = optixGetPayload_4();
  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
    return optixIgnoreIntersection();
  }
 #endif

+  const uint object = get_object_id<true>();
+  if ((kernel_tex_fetch(__object_flag, object) & SD_OBJECT_HAS_VOLUME) == 0) {
+    return optixIgnoreIntersection();
+  }
+}
+
+extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
+{
 #ifdef __HAIR__
  if (!optixIsTriangleHit()) {
-    // Filter out curve endcaps
+    /* Filter out curve endcaps. */
    const float u = __uint_as_float(optixGetAttribute_0());
    if (u == 0.0f || u == 1.0f) {
      return optixIgnoreIntersection();
@@ -277,18 +300,26 @@ extern "C" __global__ void __anyhit__kernel_optix_visibility_test()
  }
 #endif

-  // Shadow ray early termination
+#ifdef __VISIBILITY_FLAG__
+  const uint prim = optixGetPrimitiveIndex();
+  const uint visibility = optixGetPayload_4();
+  if ((kernel_tex_fetch(__prim_visibility, prim) & visibility) == 0) {
+    return optixIgnoreIntersection();
+  }
+
+  /* Shadow ray early termination. */
  if (visibility & PATH_RAY_SHADOW_OPAQUE) {
    return optixTerminateRay();
  }
+#endif
 }

 extern "C" __global__ void __closesthit__kernel_optix_hit()
 {
-  optixSetPayload_0(__float_as_uint(optixGetRayTmax()));  // Intersection distance
+  optixSetPayload_0(__float_as_uint(optixGetRayTmax())); /* Intersection distance */
  optixSetPayload_3(optixGetPrimitiveIndex());
  optixSetPayload_4(get_object_id());
-  // Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index
+  /* Can be PRIMITIVE_TRIANGLE and PRIMITIVE_MOTION_TRIANGLE or curve type and segment index. */
  optixSetPayload_5(kernel_tex_fetch(__prim_type, optixGetPrimitiveIndex()));

  if (optixIsTriangleHit()) {
@@ -297,7 +328,7 @@ extern "C" __global__ void __closesthit__kernel_optix_hit()
    optixSetPayload_2(__float_as_uint(barycentrics.x));
  }
  else {
-    optixSetPayload_1(optixGetAttribute_0());  // Same as 'optixGetCurveParameter()'
+    optixSetPayload_1(optixGetAttribute_0()); /* Same as 'optixGetCurveParameter()' */
    optixSetPayload_2(optixGetAttribute_1());
  }
 }
@@ -311,7 +342,7 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type
  float3 P = optixGetObjectRayOrigin();
  float3 dir = optixGetObjectRayDirection();

-  // The direction is not normalized by default, but the curve intersection routine expects that
+  /* The direction is not normalized by default, but the curve intersection routine expects that */
  float len;
  dir = normalize_len(dir, &len);

@@ -323,15 +354,15 @@ ccl_device_inline void optix_intersection_curve(const uint prim, const uint type

  Intersection isect;
  isect.t = optixGetRayTmax();
-  // Transform maximum distance into object space
+  /* Transform maximum distance into object space. */
  if (isect.t != FLT_MAX)
    isect.t *= len;

  if (curve_intersect(NULL, &isect, P, dir, isect.t, visibility, object, prim, time, type)) {
    optixReportIntersection(isect.t / len,
                            type & PRIMITIVE_ALL,
-                            __float_as_int(isect.u),   // Attribute_0
-                            __float_as_int(isect.v));  // Attribute_1
+                            __float_as_int(isect.u),  /* Attribute_0 */
+                            __float_as_int(isect.v)); /* Attribute_1 */
  }
 }

--- a/intern/cycles/kernel/geom/geom_curve_intersect.h
+++ b/intern/cycles/kernel/geom/geom_curve_intersect.h
@@ -713,7 +713,7 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,

    P = transform_point(&tfm, P);
    D = transform_direction(&tfm, D * t);
-    D = normalize_len(D, &t);
+    D = safe_normalize_len(D, &t);
  }

  int prim = kernel_tex_fetch(__prim_index, isect_prim);
@@ -764,8 +764,10 @@ ccl_device_inline void curve_shader_setup(const KernelGlobals *kg,
    /* Thick curves, compute normal using direction from inside the curve.
     * This could be optimized by recording the normal in the intersection,
     * however for Optix this would go beyond the size of the payload. */
+    /* NOTE: It is possible that P will be the same as P_inside (precision issues, or very small
+     * radius). In this case use the view direction to approximate the normal. */
    const float3 P_inside = float4_to_float3(catmull_rom_basis_eval(P_curve, sd->u));
-    const float3 Ng = normalize(P - P_inside);
+    const float3 Ng = (!isequal_float3(P, P_inside)) ? normalize(P - P_inside) : -sd->I;

    sd->N = Ng;
    sd->Ng = Ng;
--- a/intern/cycles/kernel/geom/geom_motion_triangle.h
+++ b/intern/cycles/kernel/geom/geom_motion_triangle.h
@@ -41,7 +41,18 @@ ccl_device_inline int find_attribute_motion(const KernelGlobals *kg,
  uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);

  while (attr_map.x != id) {
-    attr_offset += ATTR_PRIM_TYPES;
+    if (UNLIKELY(attr_map.x == ATTR_STD_NONE)) {
+      if (UNLIKELY(attr_map.y == 0)) {
+        return (int)ATTR_STD_NOT_FOUND;
+      }
+      else {
+        /* Chain jump to a different part of the table. */
+        attr_offset = attr_map.z;
+      }
+    }
+    else {
+      attr_offset += ATTR_PRIM_TYPES;
+    }
    attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
  }

--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -365,19 +365,16 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,
 #ifdef __VOLUME__
  if (!(sd.flag & SD_HAS_ONLY_VOLUME)) {
 #endif
+    const int path_flag = INTEGRATOR_STATE(path, flag);

-    {
-      const int path_flag = INTEGRATOR_STATE(path, flag);
 #ifdef __SUBSURFACE__
-      /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
-      if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
+    /* Can skip shader evaluation for BSSRDF exit point without bump mapping. */
+    if (!(path_flag & PATH_RAY_SUBSURFACE) || ((sd.flag & SD_HAS_BSSRDF_BUMP)))
 #endif
-      {
-        /* Evaluate shader. */
-        PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
-        shader_eval_surface<node_feature_mask>(
-            INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
-      }
+    {
+      /* Evaluate shader. */
+      PROFILING_EVENT(PROFILING_SHADE_SURFACE_EVAL);
+      shader_eval_surface<node_feature_mask>(INTEGRATOR_STATE_PASS, &sd, render_buffer, path_flag);
    }

 #ifdef __SUBSURFACE__
@@ -417,17 +414,20 @@ ccl_device bool integrate_surface(INTEGRATOR_STATE_ARGS,

    /* Perform path termination. Most paths have already been terminated in
     * the intersect_closest kernel, this is just for emission and for dividing
-     * throughput by the probability at the right moment. */
-    const int path_flag = INTEGRATOR_STATE(path, flag);
-    const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
-                                  0.0f :
-                                  path_state_continuation_probability(INTEGRATOR_STATE_PASS,
-                                                                      path_flag);
-    if (probability == 0.0f) {
-      return false;
-    }
-    else if (probability != 1.0f) {
-      INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+     * throughput by the probability at the right moment.
+     *
+     * Also ensure we don't do it twice for SSS at both the entry and exit point. */
+    if (!(path_flag & PATH_RAY_SUBSURFACE)) {
+      const float probability = (path_flag & PATH_RAY_TERMINATE_ON_NEXT_SURFACE) ?
+                                    0.0f :
+                                    path_state_continuation_probability(INTEGRATOR_STATE_PASS,
+                                                                        path_flag);
+      if (probability == 0.0f) {
+        return false;
+      }
+      else if (probability != 1.0f) {
+        INTEGRATOR_STATE_WRITE(path, throughput) /= probability;
+      }
    }

 #ifdef __DENOISING_FEATURES__
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -74,7 +74,7 @@ ccl_device_inline bool shadow_volume_shader_sample(INTEGRATOR_STATE_ARGS,
                                                   ShaderData *ccl_restrict sd,
                                                   float3 *ccl_restrict extinction)
 {
-  shader_eval_volume(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
+  shader_eval_volume<true>(INTEGRATOR_STATE_PASS, sd, PATH_RAY_SHADOW, [=](const int i) {
    return integrator_state_read_shadow_volume_stack(INTEGRATOR_STATE_PASS, i);
  });

@@ -93,7 +93,7 @@ ccl_device_inline bool volume_shader_sample(INTEGRATOR_STATE_ARGS,
                                            VolumeShaderCoefficients *coeff)
 {
  const int path_flag = INTEGRATOR_STATE(path, flag);
-  shader_eval_volume(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
+  shader_eval_volume<false>(INTEGRATOR_STATE_PASS, sd, path_flag, [=](const int i) {
    return integrator_state_read_volume_stack(INTEGRATOR_STATE_PASS, i);
  });

--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -60,7 +60,15 @@ CCL_NAMESPACE_BEGIN
 * TODO: these could be made dynamic depending on the features used in the scene. */

 #define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
-#define INTEGRATOR_SHADOW_ISECT_SIZE 4
+
+#define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024
+#define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4
+
+#ifdef __KERNEL_CPU__
+#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_CPU
+#else
+#  define INTEGRATOR_SHADOW_ISECT_SIZE INTEGRATOR_SHADOW_ISECT_SIZE_GPU
+#endif

 /* Data structures */

@@ -74,9 +82,9 @@ typedef struct IntegratorStateCPU {
 #define KERNEL_STRUCT_END(name) \
  } \
  name;
-#define KERNEL_STRUCT_END_ARRAY(name, size) \
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
-  name[size];
+  name[cpu_size];
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
@@ -103,9 +111,9 @@ typedef struct IntegratorStateGPU {
 #define KERNEL_STRUCT_END(name) \
  } \
  name;
-#define KERNEL_STRUCT_END_ARRAY(name, size) \
+#define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
  } \
-  name[size];
+  name[gpu_size];
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -107,7 +107,7 @@ KERNEL_STRUCT_END(subsurface)
 KERNEL_STRUCT_BEGIN(volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE)

 /********************************* Shadow Path State **************************/

@@ -153,11 +153,15 @@ KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, object, KERNEL_FEATURE_PATH_TRACIN
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, int, type, KERNEL_FEATURE_PATH_TRACING)
 /* TODO: exclude for GPU. */
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_isect, float3, Ng, KERNEL_FEATURE_PATH_TRACING)
-KERNEL_STRUCT_END_ARRAY(shadow_isect, INTEGRATOR_SHADOW_ISECT_SIZE)
+KERNEL_STRUCT_END_ARRAY(shadow_isect,
+                        INTEGRATOR_SHADOW_ISECT_SIZE_CPU,
+                        INTEGRATOR_SHADOW_ISECT_SIZE_GPU)

 /**************************** Shadow Volume Stack *****************************/

 KERNEL_STRUCT_BEGIN(shadow_volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(shadow_volume_stack, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
+                        INTEGRATOR_VOLUME_STACK_SIZE,
+                        INTEGRATOR_VOLUME_STACK_SIZE)
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -217,10 +217,10 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
    while (false) \
      ;

-#  define KERNEL_STRUCT_END_ARRAY(name, array_size) \
+#  define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
    ++index; \
    } \
-    while (index < array_size) \
+    while (index < gpu_array_size) \
      ;

 #  include "kernel/integrator/integrator_state_template.h"
@@ -264,7 +264,12 @@ ccl_device_inline void integrator_state_shadow_catcher_split(INTEGRATOR_STATE_AR

  IntegratorStateCPU *ccl_restrict split_state = state + 1;

-  *split_state = *state;
+  /* Only copy the required subset, since shadow intersections are big and irrelevant here. */
+  split_state->path = state->path;
+  split_state->ray = state->ray;
+  split_state->isect = state->isect;
+  memcpy(split_state->volume_stack, state->volume_stack, sizeof(state->volume_stack));
+  split_state->shadow_path = state->shadow_path;

  split_state->path.flag |= PATH_RAY_SHADOW_CATCHER_PASS;
 #endif
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -386,7 +386,7 @@ ccl_device_inline void kernel_accum_light(INTEGRATOR_STATE_CONST_ARGS,
 {
  /* The throughput for shadow paths already contains the light shader evaluation. */
  float3 contribution = INTEGRATOR_STATE(shadow_path, throughput);
-  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce) - 1);
+  kernel_accum_clamp(kg, &contribution, INTEGRATOR_STATE(shadow_path, bounce));

  ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
                                                              render_buffer);
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -42,6 +42,16 @@ ccl_device void kernel_displace_evaluate(const KernelGlobals *kg,

  object_inverse_dir_transform(kg, &sd, &D);

+#ifdef __KERNEL_DEBUG_NAN__
+  if (!isfinite3_safe(D)) {
+    kernel_assert(!"Cycles displacement with non-finite value detected");
+  }
+#endif
+
+  /* Ensure finite displacement, preventing BVH from becoming degenerate and avoiding possible
+   * traversal issues caused by non-finite math. */
+  D = ensure_finite3(D);
+
  /* Write output. */
  output[offset] += make_float4(D.x, D.y, D.z, 0.0f);
 }
@@ -66,7 +76,16 @@ ccl_device void kernel_background_evaluate(const KernelGlobals *kg,
  const int path_flag = PATH_RAY_EMISSION;
  shader_eval_surface<KERNEL_FEATURE_NODE_MASK_SURFACE_LIGHT>(
      INTEGRATOR_STATE_PASS_NULL, &sd, NULL, path_flag);
-  const float3 color = shader_background_eval(&sd);
+  float3 color = shader_background_eval(&sd);
+
+#ifdef __KERNEL_DEBUG_NAN__
+  if (!isfinite3_safe(color)) {
+    kernel_assert(!"Cycles background with non-finite value detected");
+  }
+#endif
+
+  /* Ensure finite color, avoiding possible numerical instabilities in the path tracing kernels. */
+  color = ensure_finite3(color);

  /* Write output. */
  output[offset] += make_float4(color.x, color.y, color.z, 0.0f);
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -394,7 +394,7 @@ film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_conver

  /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
   * shadow catcher objects in the scene. In this case there will be no auxiliary passes required
-   * for the devision (to save up memory). So delay the asserts to this point so that the number of
+   * for the decision (to save up memory). So delay the asserts to this point so that the number of
   * samples check handles such configuration. */
  kernel_assert(kfilm_convert->pass_offset != PASS_UNUSED);
  kernel_assert(kfilm_convert->pass_combined != PASS_UNUSED);
--- a/intern/cycles/kernel/kernel_jitter.h
+++ b/intern/cycles/kernel/kernel_jitter.h
@@ -74,10 +74,6 @@ ccl_device_inline float cmj_randfloat_simple(uint i, uint p)

 ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_hash, uint dimension)
 {
-  /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
-   *  the x part is used as the sample (TODO(@leesonw): Add using both x and y parts
-   * independently). */
-
  /* Perform Owen shuffle of the sample number to reorder the samples. */
 #ifdef _SIMPLE_HASH_
  const uint rv = cmj_hash_simple(dimension, rng_hash);
@@ -95,7 +91,10 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha
  const uint sample_set = s / NUM_PMJ_SAMPLES;
  const uint d = (dimension + sample_set);
  const uint dim = d % NUM_PMJ_PATTERNS;
-  int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));
+
+  /* The PMJ sample sets contain a sample with (x,y) with NUM_PMJ_SAMPLES so for 1D
+   *  the x part is used for even dims and the y for odd. */
+  int index = 2 * ((dim >> 1) * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES)) + (dim & 1);

  float fx = kernel_tex_fetch(__sample_pattern_lut, index);

@@ -104,12 +103,11 @@ ccl_device float pmj_sample_1D(const KernelGlobals *kg, uint sample, uint rng_ha
 #  ifdef _SIMPLE_HASH_
  float dx = cmj_randfloat_simple(d, rng_hash);
 #  else
-  /* Only jitter within the grid interval. */
  float dx = cmj_randfloat(d, rng_hash);
 #  endif
-  fx = fx + dx * (1.0f / NUM_PMJ_SAMPLES);
+  /* Jitter sample locations and map back into [0 1]. */
+  fx = fx + dx;
  fx = fx - floorf(fx);
-
 #else
 #  warning "Not using Cranley-Patterson Rotation."
 #endif
@@ -136,7 +134,7 @@ ccl_device void pmj_sample_2D(
  /* Based on the sample number a sample pattern is selected and offset by the dimension. */
  const uint sample_set = s / NUM_PMJ_SAMPLES;
  const uint d = (dimension + sample_set);
-  const uint dim = d % NUM_PMJ_PATTERNS;
+  uint dim = d % NUM_PMJ_PATTERNS;
  int index = 2 * (dim * NUM_PMJ_SAMPLES + (s % NUM_PMJ_SAMPLES));

  float fx = kernel_tex_fetch(__sample_pattern_lut, index);
@@ -151,17 +149,17 @@ ccl_device void pmj_sample_2D(
  float dx = cmj_randfloat(d, rng_hash);
  float dy = cmj_randfloat(d + 1, rng_hash);
 #  endif
-  /* Only jitter within the grid cells. */
-  fx = fx + dx * (1.0f / NUM_PMJ_DIVISIONS);
-  fy = fy + dy * (1.0f / NUM_PMJ_DIVISIONS);
-  fx = fx - floorf(fx);
-  fy = fy - floorf(fy);
+  /* Jitter sample locations and map back to the unit square [0 1]x[0 1]. */
+  float sx = fx + dx;
+  float sy = fy + dy;
+  sx = sx - floorf(sx);
+  sy = sy - floorf(sy);
 #else
 #  warning "Not using Cranley Patterson Rotation."
 #endif

-  (*x) = fx;
-  (*y) = fy;
+  (*x) = sx;
+  (*y) = sy;
 }

 CCL_NAMESPACE_END
--- a/intern/cycles/kernel/kernel_shader.h
+++ b/intern/cycles/kernel/kernel_shader.h
@@ -750,7 +750,7 @@ ccl_device int shader_phase_sample_closure(const KernelGlobals *kg,

 /* Volume Evaluation */

-template<typename StackReadOp>
+template<const bool shadow, typename StackReadOp>
 ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
                                          ShaderData *ccl_restrict sd,
                                          const int path_flag,
@@ -815,8 +815,11 @@ ccl_device_inline void shader_eval_volume(INTEGRATOR_STATE_CONST_ARGS,
 #  endif

    /* Merge closures to avoid exceeding number of closures limit. */
-    if (i > 0)
-      shader_merge_volume_closures(sd);
+    if (!shadow) {
+      if (i > 0) {
+        shader_merge_volume_closures(sd);
+      }
+    }
  }
 }

--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -572,6 +572,7 @@ typedef enum AttributeStandard {
  ATTR_STD_MOTION_VERTEX_NORMAL,
  ATTR_STD_PARTICLE,
  ATTR_STD_CURVE_INTERCEPT,
+  ATTR_STD_CURVE_LENGTH,
  ATTR_STD_CURVE_RANDOM,
  ATTR_STD_PTEX_FACE_ID,
  ATTR_STD_PTEX_UV,
--- a/intern/cycles/kernel/osl/osl_services.cpp
+++ b/intern/cycles/kernel/osl/osl_services.cpp
@@ -107,6 +107,7 @@ ustring OSLRenderServices::u_geom_undisplaced("geom:undisplaced");
 ustring OSLRenderServices::u_is_smooth("geom:is_smooth");
 ustring OSLRenderServices::u_is_curve("geom:is_curve");
 ustring OSLRenderServices::u_curve_thickness("geom:curve_thickness");
+ustring OSLRenderServices::u_curve_length("geom:curve_length");
 ustring OSLRenderServices::u_curve_tangent_normal("geom:curve_tangent_normal");
 ustring OSLRenderServices::u_curve_random("geom:curve_random");
 ustring OSLRenderServices::u_path_ray_length("path:ray_length");
--- a/intern/cycles/kernel/osl/osl_services.h
+++ b/intern/cycles/kernel/osl/osl_services.h
@@ -294,6 +294,7 @@ class OSLRenderServices : public OSL::RendererServices {
  static ustring u_is_smooth;
  static ustring u_is_curve;
  static ustring u_curve_thickness;
+  static ustring u_curve_length;
  static ustring u_curve_tangent_normal;
  static ustring u_curve_random;
  static ustring u_path_ray_length;
--- a/intern/cycles/kernel/shaders/node_hair_info.osl
+++ b/intern/cycles/kernel/shaders/node_hair_info.osl
@@ -18,12 +18,14 @@

 shader node_hair_info(output float IsStrand = 0.0,
                      output float Intercept = 0.0,
+                      output float Length = 0.0,
                      output float Thickness = 0.0,
                      output normal TangentNormal = N,
                      output float Random = 0)
 {
  getattribute("geom:is_curve", IsStrand);
  getattribute("geom:curve_intercept", Intercept);
+  getattribute("geom:curve_length", Length);
  getattribute("geom:curve_thickness", Thickness);
  getattribute("geom:curve_tangent_normal", TangentNormal);
  getattribute("geom:curve_random", Random);
--- a/intern/cycles/kernel/svm/svm_geometry.h
+++ b/intern/cycles/kernel/svm/svm_geometry.h
@@ -213,6 +213,8 @@ ccl_device_noinline void svm_node_hair_info(
    }
    case NODE_INFO_CURVE_INTERCEPT:
      break; /* handled as attribute */
+    case NODE_INFO_CURVE_LENGTH:
+      break; /* handled as attribute */
    case NODE_INFO_CURVE_RANDOM:
      break; /* handled as attribute */
    case NODE_INFO_CURVE_THICKNESS: {
--- a/intern/cycles/kernel/svm/svm_types.h
+++ b/intern/cycles/kernel/svm/svm_types.h
@@ -173,6 +173,7 @@ typedef enum NodeParticleInfo {
 typedef enum NodeHairInfo {
  NODE_INFO_CURVE_IS_STRAND,
  NODE_INFO_CURVE_INTERCEPT,
+  NODE_INFO_CURVE_LENGTH,
  NODE_INFO_CURVE_THICKNESS,
  /* Fade for minimum hair width transiency. */
  // NODE_INFO_CURVE_FADE,
--- a/intern/cycles/render/attribute.cpp
+++ b/intern/cycles/render/attribute.cpp
@@ -342,6 +342,8 @@ const char *Attribute::standard_name(AttributeStandard std)
      return "particle";
    case ATTR_STD_CURVE_INTERCEPT:
      return "curve_intercept";
+    case ATTR_STD_CURVE_LENGTH:
+      return "curve_length";
    case ATTR_STD_CURVE_RANDOM:
      return "curve_random";
    case ATTR_STD_PTEX_FACE_ID:
@@ -586,6 +588,9 @@ Attribute *AttributeSet::add(AttributeStandard std, ustring name)
      case ATTR_STD_CURVE_INTERCEPT:
        attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE_KEY);
        break;
+      case ATTR_STD_CURVE_LENGTH:
+        attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
+        break;
      case ATTR_STD_CURVE_RANDOM:
        attr = add(name, TypeDesc::TypeFloat, ATTR_ELEMENT_CURVE);
        break;
--- a/intern/cycles/render/buffers.cpp
+++ b/intern/cycles/render/buffers.cpp
@@ -22,7 +22,6 @@
 #include "util/util_foreach.h"
 #include "util/util_hash.h"
 #include "util/util_math.h"
-#include "util/util_opengl.h"
 #include "util/util_time.h"
 #include "util/util_types.h"

--- a/intern/cycles/render/film.cpp
+++ b/intern/cycles/render/film.cpp
@@ -434,7 +434,8 @@ void Film::update_passes(Scene *scene, bool add_sample_count_pass)
  const ObjectManager *object_manager = scene->object_manager;
  Integrator *integrator = scene->integrator;

-  if (!is_modified() && !object_manager->need_update() && !integrator->is_modified()) {
+  if (!is_modified() && !object_manager->need_update() && !integrator->is_modified() &&
+      !background->is_modified()) {
    return;
  }

--- a/intern/cycles/render/geometry.cpp
+++ b/intern/cycles/render/geometry.cpp
@@ -794,11 +794,6 @@ void GeometryManager::device_update_attributes(Device *device,
    foreach (AttributeRequest &req, attributes.requests) {
      Attribute *attr = geom->attributes.find(req);

-      /* Vertex normals are stored in DeviceScene.tri_vnormal. */
-      if (attr && attr->std == ATTR_STD_VERTEX_NORMAL) {
-        continue;
-      }
-
      update_attribute_element_size(geom,
                                    attr,
                                    ATTR_PRIM_GEOMETRY,
@@ -811,11 +806,6 @@ void GeometryManager::device_update_attributes(Device *device,
        Mesh *mesh = static_cast<Mesh *>(geom);
        Attribute *subd_attr = mesh->subd_attributes.find(req);

-        /* Vertex normals are stored in DeviceScene.tri_vnormal. */
-        if (subd_attr && subd_attr->std == ATTR_STD_VERTEX_NORMAL) {
-          continue;
-        }
-
        update_attribute_element_size(mesh,
                                      subd_attr,
                                      ATTR_PRIM_SUBD,
@@ -870,11 +860,6 @@ void GeometryManager::device_update_attributes(Device *device,
      Attribute *attr = geom->attributes.find(req);

      if (attr) {
-        /* Vertex normals are stored in DeviceScene.tri_vnormal. */
-        if (attr->std == ATTR_STD_VERTEX_NORMAL) {
-          continue;
-        }
-
        /* force a copy if we need to reallocate all the data */
        attr->modified |= attributes_need_realloc[Attribute::kernel_type(*attr)];
      }
@@ -898,11 +883,6 @@ void GeometryManager::device_update_attributes(Device *device,
        Attribute *subd_attr = mesh->subd_attributes.find(req);

        if (subd_attr) {
-          /* Vertex normals are stored in DeviceScene.tri_vnormal. */
-          if (subd_attr->std == ATTR_STD_VERTEX_NORMAL) {
-            continue;
-          }
-
          /* force a copy if we need to reallocate all the data */
          subd_attr->modified |= attributes_need_realloc[Attribute::kernel_type(*subd_attr)];
        }
--- a/intern/cycles/render/gpu_display.h
+++ b/intern/cycles/render/gpu_display.h
@@ -46,7 +46,7 @@ class GPUDisplayParams {
   * NOTE: Is not affected by the resolution divider. */
  int2 full_size = make_int2(0, 0);

-  /* Effective vieport size.
+  /* Effective viewport size.
   * In the case of border render, size of the border rectangle.
   *
   * NOTE: Is not affected by the resolution divider. */
@@ -163,7 +163,7 @@ class GPUDisplay {
   * This call might happen in parallel with draw, but can never happen in parallel with the
   * update.
   *
-   * The actual zero-ing can be deferred to a later moment. What is important is that after clear
+   * The actual zeroing can be deferred to a later moment. What is important is that after clear
   * and before pixels update the drawing texture will be fully empty, and that partial update
   * after clear will write new pixel values for an updating area, leaving everything else zeroed.
   *
--- a/intern/cycles/render/integrator.h
+++ b/intern/cycles/render/integrator.h
@@ -19,7 +19,7 @@

 #include "kernel/kernel_types.h"

-#include "device/device_denoise.h" /* For the paramaters and type enum. */
+#include "device/device_denoise.h" /* For the parameters and type enum. */
 #include "graph/node.h"
 #include "integrator/adaptive_sampling.h"

--- a/intern/cycles/render/nodes.cpp
+++ b/intern/cycles/render/nodes.cpp
@@ -4368,6 +4368,7 @@ NODE_DEFINE(HairInfoNode)

  SOCKET_OUT_FLOAT(is_strand, "Is Strand");
  SOCKET_OUT_FLOAT(intercept, "Intercept");
+  SOCKET_OUT_FLOAT(size, "Length");
  SOCKET_OUT_FLOAT(thickness, "Thickness");
  SOCKET_OUT_NORMAL(tangent_normal, "Tangent Normal");
 #if 0 /* Output for minimum hair width transparency - deactivated. */
@@ -4390,6 +4391,9 @@ void HairInfoNode::attributes(Shader *shader, AttributeRequestSet *attributes)
    if (!intercept_out->links.empty())
      attributes->add(ATTR_STD_CURVE_INTERCEPT);

+    if (!output("Length")->links.empty())
+      attributes->add(ATTR_STD_CURVE_LENGTH);
+
    if (!output("Random")->links.empty())
      attributes->add(ATTR_STD_CURVE_RANDOM);
  }
@@ -4412,6 +4416,12 @@ void HairInfoNode::compile(SVMCompiler &compiler)
    compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
  }

+  out = output("Length");
+  if (!out->links.empty()) {
+    int attr = compiler.attribute(ATTR_STD_CURVE_LENGTH);
+    compiler.add_node(NODE_ATTR, attr, compiler.stack_assign(out), NODE_ATTR_OUTPUT_FLOAT);
+  }
+
  out = output("Thickness");
  if (!out->links.empty()) {
    compiler.add_node(NODE_HAIR_INFO, NODE_INFO_CURVE_THICKNESS, compiler.stack_assign(out));
@@ -6749,8 +6759,6 @@ void NormalMapNode::attributes(Shader *shader, AttributeRequestSet *attributes)
      attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str()));
      attributes->add(ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
    }
-
-    attributes->add(ATTR_STD_VERTEX_NORMAL);
  }

  ShaderNode::attributes(shader, attributes);
@@ -7026,8 +7034,6 @@ void VectorDisplacementNode::attributes(Shader *shader, AttributeRequestSet *att
      attributes->add(ustring((string(attribute.c_str()) + ".tangent").c_str()));
      attributes->add(ustring((string(attribute.c_str()) + ".tangent_sign").c_str()));
    }
-
-    attributes->add(ATTR_STD_VERTEX_NORMAL);
  }

  ShaderNode::attributes(shader, attributes);
--- a/Show More
+++ b/Show More