/*
 * Copyright 2011-2013 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef WITH_OPENCL

#include "opencl.h"

#include "util_logging.h"
#include "util_path.h"
#include "util_time.h"

using std::cerr;
using std::endl;

CCL_NAMESPACE_BEGIN

/* A fresh program entry holds neither a program nor a mutex; the mutex is
 * allocated on demand in OpenCLCache::get_program(). */
OpenCLCache::Slot::ProgramEntry::ProgramEntry()
 : program(NULL),
   mutex(NULL)
{
}

/* Copying carries over the program handle only, the copy starts without
 * a mutex. */
OpenCLCache::Slot::ProgramEntry::ProgramEntry(const ProgramEntry& rhs)
 : program(rhs.program),
   mutex(NULL)
{
}

OpenCLCache::Slot::ProgramEntry::~ProgramEntry()
{
	delete mutex;
}

/* A fresh slot holds neither a context nor a context mutex; the mutex is
 * allocated on demand in OpenCLCache::get_context(). */
OpenCLCache::Slot::Slot()
 : context_mutex(NULL),
   context(NULL)
{
}

/* Copying carries over the program map only, context and context mutex of
 * the copy are reset to NULL. */
OpenCLCache::Slot::Slot(const Slot& rhs)
 : context_mutex(NULL),
   context(NULL),
   programs(rhs.programs)
{
}

OpenCLCache::Slot::~Slot()
{
	delete context_mutex;
}

/* Access the single cache instance shared by all callers. */
OpenCLCache& OpenCLCache::global_instance()
{
	static OpenCLCache instance;
	return instance;
}

/* Look up the cached context for the given platform/device pair.
 *
 * When a context is cached it is retained and returned, and slot_locker is
 * released before returning.
 * When nothing is cached NULL is returned and slot_locker keeps holding the
 * slot's context mutex, so the caller can create the context and hand it over
 * with store_context().
 */
cl_context OpenCLCache::get_context(cl_platform_id platform,
                                    cl_device_id device,
                                    thread_scoped_lock& slot_locker)
{
	assert(platform != NULL);

	OpenCLCache& self = global_instance();

	thread_scoped_lock cache_lock(self.cache_lock);

	/* Insert is a no-op returning the existing slot when the key is known. */
	pair<CacheMap::iterator, bool> ins = self.cache.insert(
	        CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
	Slot &slot = ins.first->second;

	/* create slot lock only while holding cache lock */
	if(!slot.context_mutex)
		slot.context_mutex = new thread_mutex;

	/* need to unlock cache before locking slot, to allow store to complete */
	cache_lock.unlock();

	/* lock the slot */
	slot_locker = thread_scoped_lock(*slot.context_mutex);

	/* If the thing isn't cached */
	if(slot.context == NULL) {
		/* return with the caller's lock holder holding the slot lock */
		return NULL;
	}

	/* the item was already cached, release the slot lock */
	slot_locker.unlock();

	cl_int ciErr = clRetainContext(slot.context);
	assert(ciErr == CL_SUCCESS);
	(void)ciErr;

	return slot.context;
}

/* Look up the cached program stored under key for the given platform/device
 * pair.
 *
 * When a program is cached it is retained and returned, and slot_locker is
 * released before returning.
 * When nothing is cached NULL is returned and slot_locker keeps holding the
 * entry's mutex, so the caller can build the program and hand it over with
 * store_program().
 */
cl_program OpenCLCache::get_program(cl_platform_id platform,
                                    cl_device_id device,
                                    ustring key,
                                    thread_scoped_lock& slot_locker)
{
	assert(platform != NULL);

	OpenCLCache& self = global_instance();

	thread_scoped_lock cache_lock(self.cache_lock);

	pair<CacheMap::iterator, bool> ins = self.cache.insert(
	        CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
	Slot &slot = ins.first->second;

	pair<Slot::EntryMap::iterator, bool> ins2 = slot.programs.insert(
	        Slot::EntryMap::value_type(key, Slot::ProgramEntry()));
	Slot::ProgramEntry &entry = ins2.first->second;

	/* create slot lock only while holding cache lock */
	if(!entry.mutex)
		entry.mutex = new thread_mutex;

	/* need to unlock cache before locking slot, to allow store to complete */
	cache_lock.unlock();

	/* lock the slot */
	slot_locker = thread_scoped_lock(*entry.mutex);

	/* If the thing isn't cached */
	if(entry.program == NULL) {
		/* return with the caller's lock holder holding the slot lock */
		return NULL;
	}

	/* the item was already cached, release the slot lock */
	slot_locker.unlock();

	cl_int ciErr = clRetainProgram(entry.program);
	assert(ciErr == CL_SUCCESS);
	(void)ciErr;

	return entry.program;
}

/* Store a freshly created context in the slot of the given platform/device
 * pair and release slot_locker.
 *
 * The slot is expected to exist already and to be empty (both are asserted),
 * which is the state get_context() leaves behind when it returns NULL.
 */
void OpenCLCache::store_context(cl_platform_id platform,
                                cl_device_id device,
                                cl_context context,
                                thread_scoped_lock& slot_locker)
{
	assert(platform != NULL);
	assert(device != NULL);
	assert(context != NULL);

	OpenCLCache &self = global_instance();

	thread_scoped_lock cache_lock(self.cache_lock);
	CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
	cache_lock.unlock();

	/* sanity check, the iterator has to be validated before it is
	 * dereferenced */
	assert(i != self.cache.end());
	Slot &slot = i->second;
	assert(slot.context == NULL);

	slot.context = context;

	/* unlock the slot */
	slot_locker.unlock();

	/* increment reference count in OpenCL.
	 * The caller is going to release the object when done with it. */
	cl_int ciErr = clRetainContext(context);
	assert(ciErr == CL_SUCCESS);
	(void)ciErr;
}

/* Store a freshly built program under key in the slot of the given
 * platform/device pair and release slot_locker.
 *
 * Both the slot and the program entry are expected to exist already and the
 * entry is expected to be empty (all asserted), which is the state
 * get_program() leaves behind when it returns NULL.
 */
void OpenCLCache::store_program(cl_platform_id platform,
                                cl_device_id device,
                                cl_program program,
                                ustring key,
                                thread_scoped_lock& slot_locker)
{
	assert(platform != NULL);
	assert(device != NULL);
	assert(program != NULL);

	OpenCLCache &self = global_instance();

	thread_scoped_lock cache_lock(self.cache_lock);

	CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
	assert(i != self.cache.end());
	Slot &slot = i->second;

	Slot::EntryMap::iterator i2 = slot.programs.find(key);
	assert(i2 != slot.programs.end());
	Slot::ProgramEntry &entry = i2->second;

	assert(entry.program == NULL);

	cache_lock.unlock();

	entry.program = program;

	/* unlock the slot */
	slot_locker.unlock();

	/* Increment reference count in OpenCL.
	 * The caller is going to release the object when done with it. */
	cl_int ciErr = clRetainProgram(program);
	assert(ciErr == CL_SUCCESS);
	(void)ciErr;
}

/* MD5 hash of the files found under the "kernel" path.
 * Computed on first use and then served from the cached string. */
string OpenCLCache::get_kernel_md5()
{
	OpenCLCache &self = global_instance();
	thread_scoped_lock lock(self.kernel_md5_lock);

	if(self.kernel_md5.empty()) {
		self.kernel_md5 = path_files_md5_hash(path_get("kernel"));
	}
	return self.kernel_md5;
}

/* Remember the program description; nothing is loaded or compiled until
 * load() is called. */
OpenCLDeviceBase::OpenCLProgram::OpenCLProgram(OpenCLDeviceBase *device,
                                               string program_name,
                                               string kernel_file,
                                               string kernel_build_options)
 : device(device),
   program_name(program_name),
   kernel_file(kernel_file),
   kernel_build_options(kernel_build_options)
{
	loaded = false;
	program = NULL;
}

OpenCLDeviceBase::OpenCLProgram::~OpenCLProgram()
{
	release();
}

/* Release every created kernel and the program, resetting the handles to
 * NULL so a repeated call does nothing. */
void OpenCLDeviceBase::OpenCLProgram::release()
{
	for(map<ustring, cl_kernel>::iterator kernel = kernels.begin();
	    kernel != kernels.end();
	    ++kernel)
	{
		if(kernel->second) {
			clReleaseKernel(kernel->second);
			kernel->second = NULL;
		}
	}

	if(program) {
		clReleaseProgram(program);
		program = NULL;
	}
}

/* Register a kernel name; the kernel itself is created in load().
 * Registering an already known name leaves the existing entry untouched. */
void OpenCLDeviceBase::OpenCLProgram::add_kernel(ustring name)
{
	if(!kernels.count(name)) {
		kernels[name] = NULL;
	}
}

/* Build the current program with the device build options followed by the
 * program specific ones.
 *
 * The build log is stored in output_msg, also when the build succeeded.
 * Returns false and sets error_msg when the build failed.
 */
bool OpenCLDeviceBase::OpenCLProgram::build_kernel(const string *debug_src)
{
	string build_options;
	build_options = device->kernel_build_options(debug_src) + kernel_build_options;

	cl_int ciErr = clBuildProgram(program, 0, NULL, build_options.c_str(), NULL, NULL);

	/* show warnings even if build is successful */
	size_t ret_val_size = 0;

	clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);

	if(ret_val_size > 1) {
		/* One extra element so the log can always be terminated. */
		vector<char> build_log(ret_val_size + 1);
		clGetProgramBuildInfo(program, device->cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);

		build_log[ret_val_size] = '\0';
		/* Skip meaningless empty output from the NVidia compiler. */
		if(!(ret_val_size == 2 && build_log[0] == '\n')) {
			output_msg = string(&build_log[0]);
		}
	}

	if(ciErr != CL_SUCCESS) {
		error_msg = string("OpenCL build failed: ") + clewErrorString(ciErr);
		return false;
	}

	return true;
}

/* Create the program from source and build it.
 *
 * When debug_src is given the preprocessed source is written to that path.
 * Returns false and sets error_msg on failure.
 */
bool OpenCLDeviceBase::OpenCLProgram::compile_kernel(const string *debug_src)
{
	/* We compile kernels consisting of many files. unfortunately OpenCL
	 * kernel caches do not seem to recognize changes in included files.
	 * so we force recompile on changes by adding the md5 hash of all files.
	 */
	string source = "#include \"kernels/opencl/" + kernel_file + "\" // " + OpenCLCache::get_kernel_md5() + "\n";
	source = path_source_replace_includes(source, path_get("kernel"));

	if(debug_src) {
		path_write_text(*debug_src, source);
	}

	size_t source_len = source.size();
	const char *source_str = source.c_str();
	cl_int ciErr;

	program = clCreateProgramWithSource(device->cxContext,
	                                    1,
	                                    &source_str,
	                                    &source_len,
	                                    &ciErr);

	if(ciErr != CL_SUCCESS) {
		error_msg = string("OpenCL program creation failed: ") + clewErrorString(ciErr);
		return false;
	}

	double starttime = time_dt();

	log += "Build flags: " + kernel_build_options + "\n";

	if(!build_kernel(debug_src))
		return false;

	log += "Kernel compilation of " + program_name + " finished in " + string_printf("%.2lfs.\n", time_dt() - starttime);

	return true;
}

/* Create the program from the cached binary file clbin and build it.
 *
 * Returns false and sets error_msg when the file can not be read, the
 * program can not be created from it, or the build fails.
 */
bool OpenCLDeviceBase::OpenCLProgram::load_binary(const string& clbin,
                                                  const string *debug_src)
{
	/* read binary into memory */
	vector<uint8_t> binary;

	if(!path_read_binary(clbin, binary)) {
		error_msg = "OpenCL failed to read cached binary " + clbin + ".";
		return false;
	}

	/* create program */
	cl_int status, ciErr;
	size_t size = binary.size();
	const uint8_t *bytes = &binary[0];

	program = clCreateProgramWithBinary(device->cxContext, 1, &device->cdDevice,
	                                    &size, &bytes, &status, &ciErr);

	if(status != CL_SUCCESS || ciErr != CL_SUCCESS) {
		error_msg = "OpenCL failed create program from cached binary " + clbin + ": "
		            + clewErrorString(status) + " " + clewErrorString(ciErr);
		return false;
	}

	if(!build_kernel(debug_src)) {
		/* load() falls back to compile_kernel() which overwrites the program
		 * handle, so drop the unusable program here to avoid leaking it. */
		clReleaseProgram(program);
		program = NULL;
		return false;
	}

	return true;
}

/* Write the binary of the current program to the file clbin.
 * Returns false when no binary size is reported or writing failed. */
bool OpenCLDeviceBase::OpenCLProgram::save_binary(const string& clbin)
{
	size_t size = 0;
	clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);

	if(!size)
		return false;

	vector<uint8_t> binary(size);
	uint8_t *bytes = &binary[0];

	clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);

	return path_write_binary(clbin, binary);
}

/* Make the program and all registered kernels ready for use.
 *
 * Tries, in order: the device kernel cache, a binary from a previous run,
 * compilation from source. A program obtained by either of the last two is
 * stored in the device kernel cache.
 * loaded is true only when every step succeeded; otherwise error_msg
 * describes the failure.
 */
void OpenCLDeviceBase::OpenCLProgram::load()
{
	assert(device);

	loaded = false;

	string device_md5 = device->device_md5_hash(kernel_build_options);

	/* Try to use cached kernel. */
	thread_scoped_lock cache_locker;
	ustring cache_key(program_name + device_md5);
	program = device->load_cached_kernel(cache_key,
	                                     cache_locker);

	if(!program) {
		log += "OpenCL program " + program_name + " not found in cache.\n";

		string basename = "cycles_kernel_" + program_name + "_" + device_md5 + "_" + OpenCLCache::get_kernel_md5();
		basename = path_cache_get(path_join("kernels", basename));
		string clbin = basename + ".clbin";

		/* path to preprocessed source for debugging */
		string clsrc, *debug_src = NULL;

		if(OpenCLInfo::use_debug()) {
			clsrc = basename + ".cl";
			debug_src = &clsrc;
		}

		/* If binary kernel exists already, try use it. */
		if(path_exists(clbin) && load_binary(clbin)) {
			/* Kernel loaded from binary, nothing to do. */
			log += "Loaded program from " + clbin + ".\n";
		}
		else {
			log += "Kernel file " + clbin + " either doesn't exist or failed to be loaded by driver.\n";

			/* If does not exist or loading binary failed, compile kernel. */
			if(!compile_kernel(debug_src)) {
				return;
			}

			/* Save binary for reuse. */
			if(!save_binary(clbin)) {
				log += "Saving compiled OpenCL kernel to " + clbin + " failed!";
			}
		}

		/* Cache the program. */
		device->store_cached_kernel(program,
		                            cache_key,
		                            cache_locker);
	}
	else {
		log += "Found cached OpenCL program " + program_name + ".\n";
	}

	/* Create a kernel object for every registered kernel name. */
	for(map<ustring, cl_kernel>::iterator kernel = kernels.begin();
	    kernel != kernels.end();
	    ++kernel)
	{
		assert(kernel->second == NULL);
		cl_int ciErr;
		string name = "kernel_ocl_" + kernel->first.string();
		kernel->second = clCreateKernel(program, name.c_str(), &ciErr);
		if(device->opencl_error(ciErr)) {
			error_msg = "Error getting kernel " + name + " from program " + program_name + ": " + clewErrorString(ciErr);
			return;
		}
	}

	loaded = true;
}

/* Print error_msg, and the build output when there is any, to stderr.
 * Does nothing when the program is loaded. */
void OpenCLDeviceBase::OpenCLProgram::report_error()
{
	if(loaded)
		return;

	cerr << error_msg << endl;
	if(!output_msg.empty()) {
		cerr << "OpenCL kernel build output for " << program_name << ":" << endl;
		cerr << output_msg << endl;
	}
}

/* Kernel of a program which has exactly one kernel registered. */
cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()()
{
	assert(kernels.size() == 1);
	return kernels.begin()->second;
}

/* Kernel registered under the given name. */
cl_kernel OpenCLDeviceBase::OpenCLProgram::operator()(ustring name)
{
	assert(kernels.count(name));
	return kernels[name];
}

/* Translate the device type from the debug flags to the OpenCL device type.
 * 0 is returned for DEVICE_NONE, unknown values map to CL_DEVICE_TYPE_ALL. */
cl_device_type OpenCLInfo::device_type()
{
	switch(DebugFlags().opencl.device_type)
	{
		case DebugFlags::OpenCL::DEVICE_NONE:
			return 0;
		case DebugFlags::OpenCL::DEVICE_ALL:
			return CL_DEVICE_TYPE_ALL;
		case DebugFlags::OpenCL::DEVICE_DEFAULT:
			return CL_DEVICE_TYPE_DEFAULT;
		case DebugFlags::OpenCL::DEVICE_CPU:
			return CL_DEVICE_TYPE_CPU;
		case DebugFlags::OpenCL::DEVICE_GPU:
			return CL_DEVICE_TYPE_GPU;
		case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
			return CL_DEVICE_TYPE_ACCELERATOR;
		default:
			return CL_DEVICE_TYPE_ALL;
	}
}

/* Value of the OpenCL debug flag. */
bool OpenCLInfo::use_debug()
{
	return DebugFlags().opencl.debug;
}

/* Whether advanced shading is used on the platform with the given name. */
bool OpenCLInfo::kernel_use_advanced_shading(const string& platform)
{
	/* keep this in sync with kernel_types.h! */
	if(platform == "NVIDIA CUDA")
		return true;
	else if(platform == "Apple")
		return true;
	else if(platform == "AMD Accelerated Parallel Processing")
		return true;
	else if(platform == "Intel(R) OpenCL")
		return true;
	/* Make sure officially unsupported OpenCL platforms
	 * does not set up to use advanced shading.
	 */
	return false;
}

/* Whether the split kernel is used for the given platform and device type.
 * A kernel type forced through the debug flags takes precedence. */
bool OpenCLInfo::kernel_use_split(const string& platform_name,
                                  const cl_device_type device_type)
{
	if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_SPLIT) {
		VLOG(1) << "Forcing split kernel to use.";
		return true;
	}
	if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_MEGA) {
		VLOG(1) << "Forcing mega kernel to use.";
		return false;
	}
	/* TODO(sergey): Replace string lookups with more enum-like API,
	 * similar to device/vendor checks blender's gpu.
	 */
	if(platform_name == "AMD Accelerated Parallel Processing" &&
	   device_type == CL_DEVICE_TYPE_GPU)
	{
		return true;
	}
	return false;
}

/* Whether the device is officially supported: GPU devices of the
 * "AMD Accelerated Parallel Processing" and "Apple" platforms. */
bool OpenCLInfo::device_supported(const string& platform_name,
                                  const cl_device_id device_id)
{
	cl_device_type device_type;
	if(clGetDeviceInfo(device_id,
	                   CL_DEVICE_TYPE,
	                   sizeof(cl_device_type),
	                   &device_type,
	                   NULL) != CL_SUCCESS)
	{
		/* The type is unknown when the query fails, do not compare an
		 * uninitialized value and consider the device unsupported. */
		return false;
	}
	if(platform_name == "AMD Accelerated Parallel Processing" &&
	   device_type == CL_DEVICE_TYPE_GPU)
	{
		return true;
	}
	if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
		return true;
	}
	return false;
}

/* Check that the platform reports OpenCL version 1.1 or later.
 *
 * When error is not NULL it receives a description of the failure, or an
 * empty string on success.
 */
bool OpenCLInfo::platform_version_check(cl_platform_id platform,
                                        string *error)
{
	const int req_major = 1, req_minor = 1;
	int major, minor;
	/* Start with an empty string: when the query fails the buffer is not
	 * guaranteed to be written, and parsing then fails in a defined way. */
	char version[256] = "";
	clGetPlatformInfo(platform,
	                  CL_PLATFORM_VERSION,
	                  sizeof(version),
	                  version,
	                  NULL);
	if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
		if(error != NULL) {
			*error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
		}
		return false;
	}
	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
		if(error != NULL) {
			*error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
		}
		return false;
	}
	if(error != NULL) {
		*error = "";
	}
	return true;
}

/* Check that the device reports OpenCL C version 1.1 or later.
 *
 * When error is not NULL it receives a description of the failure, or an
 * empty string on success.
 */
bool OpenCLInfo::device_version_check(cl_device_id device,
                                      string *error)
{
	const int req_major = 1, req_minor = 1;
	int major, minor;
	/* Start with an empty string: when the query fails the buffer is not
	 * guaranteed to be written, and parsing then fails in a defined way. */
	char version[256] = "";
	clGetDeviceInfo(device,
	                CL_DEVICE_OPENCL_C_VERSION,
	                sizeof(version),
	                version,
	                NULL);
	if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
		if(error != NULL) {
			*error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
		}
		return false;
	}
	if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
		if(error != NULL) {
			*error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
		}
		return false;
	}
	if(error != NULL) {
		*error = "";
	}
	return true;
}

/* Fill usable_devices with every device which passes the platform and device
 * version checks and is either officially supported or forced.
 *
 * All devices are considered when force_all is set or when a kernel type is
 * forced through the debug flags. Verbose logging of the enumeration is only
 * done during the first call.
 */
void OpenCLInfo::get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices,
                                    bool force_all)
{
	const bool force_all_platforms = force_all ||
		(DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT);
	const cl_device_type device_type = OpenCLInfo::device_type();
	static bool first_time = true;
#define FIRST_VLOG(severity) if(first_time) VLOG(severity)

	usable_devices->clear();

	if(device_type == 0) {
		FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
		first_time = false;
		return;
	}

	vector<cl_device_id> device_ids;
	cl_uint num_devices = 0;
	vector<cl_platform_id> platform_ids;
	cl_uint num_platforms = 0;

	/* Get devices. */
	if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
	   num_platforms == 0)
	{
		FIRST_VLOG(2) << "No OpenCL platforms were found.";
		first_time = false;
		return;
	}
	platform_ids.resize(num_platforms);
	if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
		FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
		first_time = false;
		return;
	}
	/* Devices are numbered consecutively across platforms. */
	for(cl_uint platform = 0; platform < num_platforms; platform++) {
		cl_platform_id platform_id = platform_ids[platform];
		char pname[256];
		if(clGetPlatformInfo(platform_id,
		                     CL_PLATFORM_NAME,
		                     sizeof(pname),
		                     &pname,
		                     NULL) != CL_SUCCESS)
		{
			FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
			continue;
		}
		string platform_name = pname;
		FIRST_VLOG(2) << "Enumerating devices for platform "
		              << platform_name << ".";
		if(!platform_version_check(platform_id)) {
			FIRST_VLOG(2) << "Ignoring platform " << platform_name
			              << " due to too old compiler version.";
			continue;
		}
		num_devices = 0;
		cl_int ciErr;
		if((ciErr = clGetDeviceIDs(platform_id,
		                           device_type,
		                           0,
		                           NULL,
		                           &num_devices)) != CL_SUCCESS || num_devices == 0)
		{
			FIRST_VLOG(2) << "Ignoring platform " << platform_name
			              << ", failed to fetch number of devices: " << string(clewErrorString(ciErr));
			continue;
		}
		device_ids.resize(num_devices);
		if(clGetDeviceIDs(platform_id,
		                  device_type,
		                  num_devices,
		                  &device_ids[0],
		                  NULL) != CL_SUCCESS)
		{
			FIRST_VLOG(2) << "Ignoring platform " << platform_name
			              << ", failed to fetch devices list.";
			continue;
		}
		for(cl_uint num = 0; num < num_devices; num++) {
			cl_device_id device_id = device_ids[num];
			char device_name[1024] = "\0";
			if(clGetDeviceInfo(device_id,
			                   CL_DEVICE_NAME,
			                   sizeof(device_name),
			                   &device_name,
			                   NULL) != CL_SUCCESS)
			{
				FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
				continue;
			}
			if(!device_version_check(device_id)) {
				FIRST_VLOG(2) << "Ignoring device " << device_name
				              << " due to old compiler version.";
				continue;
			}
			if(force_all_platforms ||
			   device_supported(platform_name, device_id))
			{
				/* Type of this particular device, as opposed to the
				 * requested device_type filter used for enumeration. */
				cl_device_type this_device_type;
				if(clGetDeviceInfo(device_id,
				                   CL_DEVICE_TYPE,
				                   sizeof(cl_device_type),
				                   &this_device_type,
				                   NULL) != CL_SUCCESS)
				{
					FIRST_VLOG(2) << "Ignoring device " << device_name
					              << ", failed to fetch device type.";
					continue;
				}
				FIRST_VLOG(2) << "Adding new device " << device_name << ".";
				usable_devices->push_back(OpenCLPlatformDevice(platform_id,
				                                               platform_name,
				                                               device_id,
				                                               this_device_type,
				                                               device_name));
			}
			else {
				FIRST_VLOG(2) << "Ignoring device " << device_name
				              << ", not officially supported yet.";
			}
		}
	}
	first_time = false;
}
/* The helper macro is only meant for get_usable_devices(). */
#undef FIRST_VLOG

CCL_NAMESPACE_END

#endif