diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index b5eaa69bf0e..5440bd91987 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -43,7 +43,9 @@ public:
 	CUmodule cuModule;
 	map<device_ptr, bool> tex_interp_map;
 	int cuDevId;
+	int cuDevArchitecture;
 	bool first_error;
+	bool use_texture_storage;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -173,6 +175,7 @@ public:
 	{
 		first_error = true;
 		background = background_;
+		use_texture_storage = true;
 
 		cuDevId = info.num;
 		cuDevice = 0;
@@ -203,6 +206,15 @@ public:
 		if(cuda_error_(result, "cuCtxCreate"))
 			return;
 
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		cuDevArchitecture = major*100 + minor*10;
+
+		/* In order to use full 6GB of memory on Titan cards, use arrays instead
+		 * of textures. On earlier cards this seems slower, but on Titan it is
+		 * actually slightly faster in tests. */
+		use_texture_storage = (cuDevArchitecture < 350);
+
 		cuda_pop_context();
 	}
 
@@ -210,8 +222,7 @@ public:
 	{
 		task_pool.stop();
 
-		cuda_push_context();
-		cuda_assert(cuCtxDetach(cuContext))
+		cuda_assert(cuCtxDestroy(cuContext))
 	}
 
 	bool support_device(bool experimental)
@@ -448,90 +459,118 @@ public:
 		CUarray_format_enum format;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
+		bool use_texture = interpolation || use_texture_storage;
 
-		switch(mem.data_type) {
-			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-			default: assert(0); return;
-		}
+		if(use_texture) {
 
-		CUtexref texref = NULL;
+			switch(mem.data_type) {
+				case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
+				case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
+				case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
+				case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
+				default: assert(0); return;
+			}
 
-		cuda_push_context();
-		cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
+			CUtexref texref = NULL;
 
-		if(!texref) {
-			cuda_pop_context();
-			return;
-		}
+			cuda_push_context();
+			cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))
 
-		if(interpolation) {
-			CUarray handle = NULL;
-			CUDA_ARRAY_DESCRIPTOR desc;
-
-			desc.Width = mem.data_width;
-			desc.Height = mem.data_height;
-			desc.Format = format;
-			desc.NumChannels = mem.data_elements;
-
-			cuda_assert(cuArrayCreate(&handle, &desc))
-
-			if(!handle) {
+			if(!texref) {
 				cuda_pop_context();
 				return;
 			}
 
-			if(mem.data_height > 1) {
-				CUDA_MEMCPY2D param;
-				memset(&param, 0, sizeof(param));
-				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-				param.dstArray = handle;
-				param.srcMemoryType = CU_MEMORYTYPE_HOST;
-				param.srcHost = (void*)mem.data_pointer;
-				param.srcPitch = mem.data_width*dsize*mem.data_elements;
-				param.WidthInBytes = param.srcPitch;
-				param.Height = mem.data_height;
+			if(interpolation) {
+				CUarray handle = NULL;
+				CUDA_ARRAY_DESCRIPTOR desc;
 
-				cuda_assert(cuMemcpy2D(&param))
+				desc.Width = mem.data_width;
+				desc.Height = mem.data_height;
+				desc.Format = format;
+				desc.NumChannels = mem.data_elements;
+
+				cuda_assert(cuArrayCreate(&handle, &desc))
+
+				if(!handle) {
+					cuda_pop_context();
+					return;
+				}
+
+				if(mem.data_height > 1) {
+					CUDA_MEMCPY2D param;
+					memset(&param, 0, sizeof(param));
+					param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+					param.dstArray = handle;
+					param.srcMemoryType = CU_MEMORYTYPE_HOST;
+					param.srcHost = (void*)mem.data_pointer;
+					param.srcPitch = mem.data_width*dsize*mem.data_elements;
+					param.WidthInBytes = param.srcPitch;
+					param.Height = mem.data_height;
+
+					cuda_assert(cuMemcpy2D(&param))
+				}
+				else
+					cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+
+				cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+
+				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+
+				mem.device_pointer = (device_ptr)handle;
+
+				stats.mem_alloc(size);
 			}
-			else
-				cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))
+			else {
+				cuda_pop_context();
 
-			cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))
+				mem_alloc(mem, MEM_READ_ONLY);
+				mem_copy_to(mem);
 
-			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
-			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))
+				cuda_push_context();
 
-			mem.device_pointer = (device_ptr)handle;
+				cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
+				cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
+				cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
+			}
 
-			stats.mem_alloc(size);
+			if(periodic) {
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
+			}
+			else {
+				cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
+				cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
+			}
+			cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+
+			cuda_pop_context();
 		}
 		else {
-			cuda_pop_context();
-
 			mem_alloc(mem, MEM_READ_ONLY);
 			mem_copy_to(mem);
 
 			cuda_push_context();
 
-			cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
-			cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
-			cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
-		}
+			CUdeviceptr cumem;
+			size_t cubytes;
 
-		if(periodic) {
-			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
-			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
-		}
-		else {
-			cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
-			cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
-		}
-		cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))
+			cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, name))
 
-		cuda_pop_context();
+			if(cubytes == 8) {
+				/* 64 bit device pointer */
+				uint64_t ptr = mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+			}
+			else {
+				/* 32 bit device pointer */
+				uint32_t ptr = (uint32_t)mem.device_pointer;
+				cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes))
+			}
+
+			cuda_pop_context();
+		}
 
 		tex_interp_map[mem.device_pointer] = interpolation;
 	}
diff --git a/intern/cycles/kernel/kernel_bvh.h b/intern/cycles/kernel/kernel_bvh.h
index 4b01f2eebcd..44a9822c103 100644
--- a/intern/cycles/kernel/kernel_bvh.h
+++ b/intern/cycles/kernel/kernel_bvh.h
@@ -809,11 +809,16 @@ __device_inline void bvh_triangle_intersect_subsurface(KernelGlobals *kg, Inters
 #include "kernel_bvh_subsurface.h"
 #endif
 
-
-#ifdef __HAIR__
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+/* to work around titan bug when using arrays instead of textures */
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
 #else
-__device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
+__device_noinline
+#endif
+#ifdef __HAIR__
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect, uint *lcg_state, float difl, float extmax)
+#else
+bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
 #endif
 {
 #ifdef __OBJECT_MOTION__
@@ -851,8 +856,14 @@ __device_inline bool scene_intersect(KernelGlobals *kg, const Ray *ray, const ui
 #endif /* __KERNEL_CPU__ */
 }
 
+/* to work around titan bug when using arrays instead of textures */
 #ifdef __SUBSURFACE__
-__device_inline uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
+#if !defined(__KERNEL_CUDA__) || defined(__KERNEL_CUDA_TEX_STORAGE__)
+__device_inline
+#else
+__device_noinline
+#endif
+uint scene_intersect_subsurface(KernelGlobals *kg, const Ray *ray, Intersection *isect, int subsurface_object, uint *lcg_state, int max_hits)
 {
 #ifdef __OBJECT_MOTION__
 	if(kernel_data.bvh.have_motion) {
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index cb86ce8c4ae..44c2b9effe9 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -57,7 +57,18 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
 
+/* In order to use full 6GB of memory on Titan cards, use arrays instead
+ * of textures. On earlier cards this seems slower, but on Titan it is
+ * actually slightly faster in tests. */
+#if __CUDA_ARCH__ < 350
+#define __KERNEL_CUDA_TEX_STORAGE__
+#endif
+
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+#else
+#define kernel_tex_fetch(t, index) t[(index)]
+#endif
 #define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
 #define kernel_data __data
 
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index ab0a717b592..b5e691eb615 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -66,7 +66,11 @@ typedef struct KernelGlobals {
 __constant__ KernelData __data;
 typedef struct KernelGlobals {} KernelGlobals;
 
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
 #define KERNEL_TEX(type, ttype, name) ttype name;
+#else
+#define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
+#endif
 #define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
 #include "kernel_textures.h"
 
diff --git a/intern/cycles/kernel/kernel_primitive.h b/intern/cycles/kernel/kernel_primitive.h
index 4a06dff84bf..636cfd06532 100644
--- a/intern/cycles/kernel/kernel_primitive.h
+++ b/intern/cycles/kernel/kernel_primitive.h
@@ -93,7 +93,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
 	if(sd->segment != ~0)
+#ifdef __DPDU__
 		return normalize(sd->dPdu);
+#else
+		return make_float3(0.0f, 0.0f, 0.0f);
+#endif
 #endif
 
 	/* try to create spherical tangent from generated coordinates */
@@ -108,7 +112,11 @@ __device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 	}
 	else {
 		/* otherwise use surface derivatives */
+#ifdef __DPDU__
 		return normalize(sd->dPdu);
+#else
+		return make_float3(0.0f, 0.0f, 0.0f);
+#endif
 	}
 }
 
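Note: the host-side selection in device_cuda.cpp above comes down to a single driver API query. Below is a minimal standalone sketch of that check, not part of the patch, assuming device ordinal 0 and a hypothetical CU_CHECK error macro; it mirrors the major*100 + minor*10 encoding the patch stores in cuDevArchitecture and the resulting use_texture_storage decision.

/* Minimal sketch, assuming device ordinal 0 and a simple CU_CHECK macro
 * (both illustrative, not from the patch). It performs the same compute
 * capability query device_cuda.cpp uses to decide between texture storage
 * (below sm_35) and plain global-memory arrays (Titan / sm_35 and newer). */
#include <stdio.h>
#include <cuda.h>

#define CU_CHECK(stmt) \
	do { \
		CUresult res_ = (stmt); \
		if(res_ != CUDA_SUCCESS) { \
			fprintf(stderr, "%s failed (error %d)\n", #stmt, (int)res_); \
			return 1; \
		} \
	} while(0)

int main(void)
{
	CUdevice dev;
	int major, minor;

	CU_CHECK(cuInit(0));
	CU_CHECK(cuDeviceGet(&dev, 0));
	CU_CHECK(cuDeviceComputeCapability(&major, &minor, dev));

	/* same encoding as cuDevArchitecture in the patch */
	int architecture = major*100 + minor*10;
	int use_texture_storage = (architecture < 350);

	printf("sm_%d%d -> %s\n", major, minor,
	       use_texture_storage ? "texture storage" : "array (global memory) storage");

	return 0;
}

Built against the driver API (link with -lcuda), a GTX Titan reports sm_35 and therefore takes the array path, matching the comment added in the patch.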