@@ -37,6 +37,7 @@
 #  include <cudaGL.h>
 #endif
 #include "util/util_debug.h"
+#include "util/util_foreach.h"
 #include "util/util_logging.h"
 #include "util/util_map.h"
 #include "util/util_md5.h"
@@ -128,6 +129,12 @@ public:
 	CUdevice cuDevice;
 	CUcontext cuContext;
 	CUmodule cuModule, cuFilterModule;
+	size_t device_texture_headroom;
+	size_t device_working_headroom;
+	bool move_texture_to_host;
+	size_t map_host_used;
+	size_t map_host_limit;
+	int can_map_host;
 	int cuDevId;
 	int cuDevArchitecture;
 	bool first_error;
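
These six members carry the out-of-core state: the two headroom values are device-memory margins to keep free for texture and working allocations, map_host_used/map_host_limit track and cap the pinned host memory this device maps, move_texture_to_host signals tex_alloc() to allocate in host memory only, and can_map_host caches the device's CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY attribute.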
@@ -135,12 +142,15 @@ public:
 	struct CUDAMem {
 		CUDAMem()
-		: texobject(0), array(0) {}
+		: texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
 
 		CUtexObject texobject;
 		CUarray array;
+		void *map_host_pointer;
+		bool free_map_host;
 	};
 
-	map<device_memory*, CUDAMem> cuda_mem_map;
+	typedef map<device_memory*, CUDAMem> CUDAMemMap;
+	CUDAMemMap cuda_mem_map;
 
 	struct PixelMem {
 		GLuint cuPBO;
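
CUDAMem now records, per allocation, whether the buffer lives in mapped host memory (map_host_pointer) and whether this device owns that host allocation and must free it (free_map_host). The CUDAMemMap typedef lets the map type be spelled without a comma inside the foreach() macro used by move_textures_to_host() below.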
@@ -240,6 +250,13 @@ public:
 		need_texture_info = false;
 
+		device_texture_headroom = 0;
+		device_working_headroom = 0;
+		move_texture_to_host = false;
+		map_host_limit = 0;
+		map_host_used = 0;
+		can_map_host = 0;
+
 		/* Initialize CUDA. */
 		if(cuda_error(cuInit(0)))
 			return;
@@ -248,9 +265,16 @@ public:
 		if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
 			return;
 
-		/* CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render. */
+		/* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+		 * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+		 * so we can predict which memory to map to host. */
+		cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+		unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+		if(can_map_host) {
+			ctx_flags |= CU_CTX_MAP_HOST;
+			init_host_memory();
+		}
+
 		/* Create context. */
 		CUresult result;
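
For readers new to these driver-API calls: CU_CTX_MAP_HOST lets pinned host allocations be mapped into the device address space, which is what the fallback path in generic_alloc() below relies on. A minimal standalone sketch (not part of the patch; device 0, the 1 KB buffer size, and the omitted error handling are arbitrary choices) of the attribute query, context flag, and mapping round trip:

#include <cuda.h>
#include <cstdio>

int main()
{
	cuInit(0);

	CUdevice dev;
	cuDeviceGet(&dev, 0);

	/* Same capability check the patch performs before enabling the flag. */
	int can_map = 0;
	cuDeviceGetAttribute(&can_map, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);

	CUcontext ctx;
	cuCtxCreate(&ctx, CU_CTX_LMEM_RESIZE_TO_MAX | (can_map? CU_CTX_MAP_HOST: 0), dev);

	if(can_map) {
		/* Pinned, device-mapped, write-combined host allocation, matching
		 * the flags used in generic_alloc(). */
		void *host = NULL;
		if(cuMemHostAlloc(&host, 1024,
		                  CU_MEMHOSTALLOC_DEVICEMAP |
		                  CU_MEMHOSTALLOC_WRITECOMBINED) == CUDA_SUCCESS) {
			/* Device-side address aliasing the same physical pages. */
			CUdeviceptr device = 0;
			cuMemHostGetDevicePointer_v2(&device, host, 0);
			printf("host %p maps to device 0x%llx\n", host, (unsigned long long)device);
			cuMemFreeHost(host);
		}
	}

	cuCtxDestroy(ctx);
	return 0;
}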
@@ -611,6 +635,50 @@ public:
 		VLOG(1) << "Local memory reserved "
 		        << string_human_readable_number(free_before - free_after) << " bytes. ("
 		        << string_human_readable_size(free_before - free_after) << ")";
+
+#if 0
+		/* For testing mapped host memory, fill up device memory. */
+		const size_t keep_mb = 1024;
+
+		while(free_after > keep_mb * 1024 * 1024LL) {
+			CUdeviceptr tmp;
+			cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+			cuMemGetInfo(&free_after, &total);
+		}
+#endif
 	}
 
+	void init_host_memory()
+	{
+		/* Limit amount of host mapped memory, because allocating too much can
+		 * cause system instability. Leave at least half or 4 GB of system
+		 * memory free, whichever is smaller. */
+		size_t default_limit = 4*1024*1024*1024LL;
+		size_t system_ram = system_physical_ram();
+
+		if(system_ram > 0) {
+			if(system_ram / 2 > default_limit) {
+				map_host_limit = system_ram - default_limit;
+			}
+			else {
+				map_host_limit = system_ram / 2;
+			}
+		}
+		else {
+			VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+			map_host_limit = 0;
+		}
+
+		/* Amount of device memory to keep free after texture memory
+		 * and working memory allocations respectively. We set the working
+		 * memory limit headroom lower so that some space is left after all
+		 * texture memory allocations. */
+		device_working_headroom = 32*1024*1024LL; // 32MB
+		device_texture_headroom = 128*1024*1024LL; // 128MB
+
+		VLOG(1) << "Mapped host memory limit set to "
+		        << string_human_readable_number(map_host_limit) << " bytes. ("
+		        << string_human_readable_size(map_host_limit) << ")";
+	}
+
 	void load_texture_info()
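
As a worked example of the limit logic above: on a 16 GB machine, half of system RAM (8 GB) exceeds the 4 GB default, so map_host_limit becomes 16 GB - 4 GB = 12 GB; on a 6 GB machine, half of RAM (3 GB) is below 4 GB, so the limit is 3 GB. Either way, the smaller of half the RAM and 4 GB is left free for the rest of the system.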
@@ -621,20 +689,167 @@ public:
 		}
 	}
 
-	CUDAMem *generic_alloc(device_memory& mem, size_t padding = 0)
+	void move_textures_to_host(size_t size, bool for_texture)
+	{
+		/* Signal to reallocate textures in host memory only. */
+		move_texture_to_host = true;
+
+		while(size > 0) {
+			/* Find suitable memory allocation to move. */
+			device_memory *max_mem = NULL;
+			size_t max_size = 0;
+			bool max_is_image = false;
+
+			foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
+				device_memory& mem = *pair.first;
+				CUDAMem *cmem = &pair.second;
+
+				bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+				bool is_image = is_texture && (mem.data_height > 1);
+
+				/* Can't move this type of memory. */
+				if(!is_texture || cmem->array) {
+					continue;
+				}
+
+				/* Already in host memory. */
+				if(cmem->map_host_pointer) {
+					continue;
+				}
+
+				/* For other textures, only move image textures. */
+				if(for_texture && !is_image) {
+					continue;
+				}
+
+				/* Try to move largest allocation, prefer moving images. */
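+				/* (The bools compare as ints here: any image outranks any
+				 * non-image, and within the same class the larger allocation
+				 * wins.) */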
+				if(is_image > max_is_image ||
+				   (is_image == max_is_image && mem.device_size > max_size)) {
+					max_is_image = is_image;
+					max_size = mem.device_size;
+					max_mem = &mem;
+				}
+			}
+
+			/* Move to host memory. This part is mutex protected since
+			 * multiple CUDA devices could be moving the memory. The
+			 * first one will do it, and the rest will adopt the pointer. */
+			if(max_mem) {
+				VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+				static thread_mutex move_mutex;
+				thread_scoped_lock lock(move_mutex);
+
+				/* Preserve the original device pointer, in case of multi device
+				 * we can't change it because the pointer mapping would break. */
+				device_ptr prev_pointer = max_mem->device_pointer;
+				size_t prev_size = max_mem->device_size;
+
+				tex_free(*max_mem);
+				tex_alloc(*max_mem);
+				size = (max_size >= size)? 0: size - max_size;
+
+				max_mem->device_pointer = prev_pointer;
+				max_mem->device_size = prev_size;
+			}
+			else {
+				break;
+			}
+		}
+
+		/* Update texture info array with new pointers. */
+		load_texture_info();
+
+		move_texture_to_host = false;
+	}
+
+	CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
 	{
 		CUDAContextScope scope(this);
 
+		CUdeviceptr device_pointer = 0;
+		size_t size = mem.memory_size() + pitch_padding;
+
+		CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+		const char *status = "";
+
+		/* First try allocating in device memory, respecting headroom. We make
+		 * an exception for texture info. It is small and frequently accessed,
+		 * so treat it as working memory.
+		 *
+		 * If there is not enough room for working memory, we will try to move
+		 * textures to host memory, assuming the performance impact would have
+		 * been worse for working memory. */
+		bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+		bool is_image = is_texture && (mem.data_height > 1);
+
+		size_t headroom = (is_texture)? device_texture_headroom:
+		                                device_working_headroom;
+
+		size_t total = 0, free = 0;
+		cuMemGetInfo(&free, &total);
+
+		/* Move textures to host memory if needed. */
+		if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
+			move_textures_to_host(size + headroom - free, is_texture);
+			cuMemGetInfo(&free, &total);
+		}
+
+		/* Allocate in device memory. */
+		if(!move_texture_to_host && (size + headroom) < free) {
+			mem_alloc_result = cuMemAlloc(&device_pointer, size);
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				status = " in device memory";
+			}
+		}
+
+		/* Fall back to mapped host memory if needed and possible. */
+		void *map_host_pointer = 0;
+		bool free_map_host = false;
+
+		if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+		   map_host_used + size < map_host_limit) {
+			if(mem.shared_pointer) {
+				/* Another device already allocated host memory. */
+				mem_alloc_result = CUDA_SUCCESS;
+				map_host_pointer = mem.shared_pointer;
+			}
+			else {
+				/* Allocate host memory ourselves. */
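+				/* (Write-combined pinned memory: fast CPU writes and fast GPU
+				 * reads over PCIe, but very slow uncached CPU reads.) */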
+				mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
+				                                  CU_MEMHOSTALLOC_DEVICEMAP |
+				                                  CU_MEMHOSTALLOC_WRITECOMBINED);
+				mem.shared_pointer = map_host_pointer;
+				free_map_host = true;
+			}
+
+			if(mem_alloc_result == CUDA_SUCCESS) {
+				cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+				map_host_used += size;
+				status = " in host memory";
+
+				/* Replace host pointer with our host allocation. Only works if
+				 * CUDA memory layout is the same and has no pitch padding. */
+				if(pitch_padding == 0 && mem.host_pointer && mem.host_pointer != mem.shared_pointer) {
+					memcpy(mem.shared_pointer, mem.host_pointer, size);
+					mem.host_free();
+					mem.host_pointer = mem.shared_pointer;
+				}
+			}
+		}
+
+		if(mem_alloc_result != CUDA_SUCCESS) {
+			cuda_assert(mem_alloc_result);
+			status = " failed, out of memory";
+		}
+
 		if(mem.name) {
 			VLOG(1) << "Buffer allocate: " << mem.name << ", "
 			        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-			        << string_human_readable_size(mem.memory_size()) << ")";
+			        << string_human_readable_size(mem.memory_size()) << ")"
+			        << status;
 		}
 
-		/* Allocate memory on device. */
-		CUdeviceptr device_pointer = 0;
-		size_t size = mem.memory_size();
-		cuda_assert(cuMemAlloc(&device_pointer, size + padding));
 		mem.device_pointer = (device_ptr)device_pointer;
 		mem.device_size = size;
 		stats.mem_alloc(size);
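
To put hypothetical numbers on the allocation policy above: with 200 MB free on the device, a 100 MB one-dimensional data texture sees 100 + 128 (texture headroom) >= 200, so move_textures_to_host() is asked to evict 28 MB, preferring image textures; the device allocation then only proceeds if 100 + 128 < free afterwards, and otherwise falls through to the pinned-host path. Image textures skip the pre-move and drop straight to host memory when the device is full, and working memory uses the smaller 32 MB headroom so it can still claim space that the larger texture headroom kept free.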
@@ -645,6 +860,8 @@ public:
 		/* Insert into map of allocations. */
 		CUDAMem *cmem = &cuda_mem_map[&mem];
+		cmem->map_host_pointer = map_host_pointer;
+		cmem->free_map_host = free_map_host;
 
 		return cmem;
 	}
@@ -652,7 +869,12 @@ public:
 	{
 		if(mem.host_pointer && mem.device_pointer) {
 			CUDAContextScope scope(this);
-			cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
+
+			if(mem.host_pointer != mem.shared_pointer) {
+				cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
+				                         mem.host_pointer,
+				                         mem.memory_size()));
+			}
 		}
 	}
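
When an allocation landed in mapped host memory, host_pointer and shared_pointer alias the same pages, so the explicit host-to-device copy is skipped; the device already sees whatever the host wrote there.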
@@ -660,8 +882,24 @@ public:
 	{
 		if(mem.device_pointer) {
 			CUDAContextScope scope(this);
+			const CUDAMem& cmem = cuda_mem_map[&mem];
 
-			cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)));
+			if(cmem.map_host_pointer) {
+				/* Free host memory. */
+				if(cmem.free_map_host) {
+					cuMemFreeHost(cmem.map_host_pointer);
+					if(mem.host_pointer == mem.shared_pointer) {
+						mem.host_pointer = 0;
+					}
+					mem.shared_pointer = 0;
+				}
+
+				map_host_used -= mem.device_size;
+			}
+			else {
+				/* Free device memory. */
+				cuMemFree(mem.device_pointer);
+			}
 
 			stats.mem_free(mem.device_size);
 			mem.device_pointer = 0;
@@ -735,7 +973,8 @@ public:
 			memset(mem.host_pointer, 0, mem.memory_size());
 		}
 
-		if(mem.device_pointer) {
+		if(mem.device_pointer &&
+		   (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
 			CUDAContextScope scope(this);
 			cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
 		}
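
The extra condition avoids clearing the same pages twice: when host_pointer aliases shared_pointer, the memset() above already zeroed the device-visible memory.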
@@ -774,10 +1013,6 @@ public:
 	{
 		CUDAContextScope scope(this);
 
-		VLOG(1) << "Texture allocate: " << mem.name << ", "
-		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
-		        << string_human_readable_size(mem.memory_size()) << ")";
-
 		/* Check if we are on sm_30 or above, for bindless textures. */
 		bool has_fermi_limits = info.has_fermi_limits;
@@ -881,6 +1116,10 @@ public:
 		desc.NumChannels = mem.data_elements;
 		desc.Flags = 0;
 
+		VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+		        << string_human_readable_number(mem.memory_size()) << " bytes. ("
+		        << string_human_readable_size(mem.memory_size()) << ")";
+
 		cuda_assert(cuArray3DCreate(&array_3d, &desc));
 
 		if(!array_3d) {
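
The "Texture allocate" log is dropped from tex_alloc() because textures now funnel through generic_alloc(), which logs each buffer together with its device or host placement status; 3D textures, backed by CUDA arrays rather than generic allocations, instead get their own "Array 3D allocate" log at creation.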